diff options
92 files changed, 2048 insertions, 2197 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 282de3680367..c57e5b7cb532 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -336,18 +336,11 @@ What: /sys/block/<disk>/queue/io_poll_delay Date: November 2016 Contact: linux-block@vger.kernel.org Description: - [RW] If polling is enabled, this controls what kind of polling - will be performed. It defaults to -1, which is classic polling. + [RW] This was used to control what kind of polling will be + performed. It is now fixed to -1, which is classic polling. In this mode, the CPU will repeatedly ask for completions - without giving up any time. If set to 0, a hybrid polling mode - is used, where the kernel will attempt to make an educated guess - at when the IO will complete. Based on this guess, the kernel - will put the process issuing IO to sleep for an amount of time, - before entering a classic poll loop. This mode might be a little - slower than pure classic polling, but it will be more efficient. - If set to a value larger than 0, the kernel will put the process - issuing IO to sleep for this amount of microseconds before - entering classic polling. + without giving up any time. + <deprecated> What: /sys/block/<disk>/queue/io_timeout diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index f9bf18ea6509..90b733422ed4 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -270,8 +270,7 @@ Request queue based layered devices like dm-rq that wish to support inline encryption need to create their own blk_crypto_profile for their request_queue, and expose whatever functionality they choose. When a layered device wants to pass a clone of that request to another request_queue, blk-crypto will -initialize and prepare the clone as necessary; see -``blk_crypto_insert_cloned_request()``. +initialize and prepare the clone as necessary. Interaction between inline encryption and blk integrity ======================================================= diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 89ffb3aa992c..74f7d051665b 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -497,15 +497,9 @@ static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; - return &bgd->pd; -} -static void bfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - - d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? - CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; + bgd->weight = CGROUP_WEIGHT_DFL; + return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) @@ -1301,8 +1295,6 @@ struct blkcg_policy blkcg_policy_bfq = { .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 69aaee52285a..467e8cfc41a2 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -20,7 +20,6 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE diff --git a/block/bio.c b/block/bio.c index fd11614bba4d..fc98c1c723ca 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1168,7 +1168,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + bio_release_page(bio, bvec->bv_page); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1190,7 +1190,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = size; - bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); } @@ -1205,7 +1204,7 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, } if (same_page) - put_page(page); + bio_release_page(bio, page); return 0; } @@ -1219,7 +1218,7 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; if (same_page) - put_page(page); + bio_release_page(bio, page); return 0; } @@ -1230,10 +1229,10 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. - * For multi-segment *iter, this function only adds pages from the - * next non-empty segment of the iov iterator. + * Extracts pages from *iter and appends them to @bio's bvec array. The pages + * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. + * For a multi-segment *iter, this function only adds pages from the next + * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1265,9 +1264,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ - size = iov_iter_get_pages(iter, pages, - UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset, extraction_flags); + size = iov_iter_extract_pages(iter, &pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; @@ -1300,7 +1299,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) iov_iter_revert(iter, left); out: while (i < nr_pages) - put_page(pages[i++]); + bio_release_page(bio, pages[i++]); return ret; } @@ -1335,6 +1334,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); @@ -1488,8 +1489,8 @@ void bio_set_pages_dirty(struct bio *bio) * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one put_page() against each page and will run one - * bio_put() against the BIO. + * here on. It will unpin each page and will run one bio_put() against the + * BIO. */ static void bio_dirty_fn(struct work_struct *work); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 18c922579719..5fa77f32a52b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1289,8 +1289,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -1395,26 +1393,6 @@ void blkcg_exit_disk(struct gendisk *disk) blk_throtl_exit(disk); } -static void blkcg_bind(struct cgroup_subsys_state *root_css) -{ - int i; - - mutex_lock(&blkcg_pol_mutex); - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkcg *blkcg; - - if (!pol || !pol->cpd_bind_fn) - continue; - - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) - if (blkcg->cpd[pol->plid]) - pol->cpd_bind_fn(blkcg->cpd[pol->plid]); - } - mutex_unlock(&blkcg_pol_mutex); -} - static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) @@ -1428,7 +1406,6 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, - .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", @@ -1666,8 +1643,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index e98d2c1be354..6794157ea1eb 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -174,9 +174,7 @@ struct blkcg_policy { /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; - blkcg_pol_init_cpd_fn *cpd_init_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; - blkcg_pol_bind_cpd_fn *cpd_bind_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; diff --git a/block/blk-core.c b/block/blk-core.c index 478978dcb2bd..cf0e2a87de60 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) static void blk_free_queue(struct request_queue *q) { - if (q->poll_stat) - blk_stat_remove_callback(q, q->poll_cb); - blk_stat_free_callback(q->poll_cb); - blk_free_queue_stats(q->stats); - kfree(q->poll_stat); - if (queue_is_mq(q)) blk_mq_release(q); diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index a8cdaf26851e..93a141979694 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -65,6 +65,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return rq->crypt_keyslot; +} + blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); @@ -119,6 +124,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return false; } +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return false; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); @@ -153,14 +163,21 @@ static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) return true; } -blk_status_t __blk_crypto_init_request(struct request *rq); -static inline blk_status_t blk_crypto_init_request(struct request *rq) +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); +static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) - return __blk_crypto_init_request(rq); + return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } +void __blk_crypto_rq_put_keyslot(struct request *rq); +static inline void blk_crypto_rq_put_keyslot(struct request *rq) +{ + if (blk_crypto_rq_has_keyslot(rq)) + __blk_crypto_rq_put_keyslot(rq); +} + void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { @@ -188,21 +205,6 @@ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, return 0; } -/** - * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted - * into a request queue. - * @rq: the request being queued - * - * Return: BLK_STS_OK on success, nonzero on error. - */ -static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) -{ - - if (blk_crypto_rq_is_encrypted(rq)) - return blk_crypto_init_request(rq); - return BLK_STS_OK; -} - #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 0307fb0d95d3..2a67d3fb63e5 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -227,14 +227,13 @@ EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index); * @profile: the crypto profile of the device the key will be used on * @key: the key that will be used * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct - * will be stored here; otherwise NULL will be stored here. + * will be stored here. blk_crypto_put_keyslot() must be called + * later to release it. Otherwise, NULL will be stored here. * * If the device has keyslots, this gets a keyslot that's been programmed with * the specified key. If the key is already in a slot, this reuses it; * otherwise this waits for a slot to become idle and programs the key into it. * - * This must be paired with a call to blk_crypto_put_keyslot(). - * * Context: Process context. Takes and releases profile->lock. * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or * one wasn't needed; or a blk_status_t error on failure. @@ -312,20 +311,15 @@ success: /** * blk_crypto_put_keyslot() - Release a reference to a keyslot - * @slot: The keyslot to release the reference of (may be NULL). + * @slot: The keyslot to release the reference of * * Context: Any context. */ void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) { - struct blk_crypto_profile *profile; + struct blk_crypto_profile *profile = slot->profile; unsigned long flags; - if (!slot) - return; - - profile = slot->profile; - if (atomic_dec_and_lock_irqsave(&slot->slot_refs, &profile->idle_slots_lock, flags)) { list_add_tail(&slot->idle_slot_node, &profile->idle_slots); @@ -354,28 +348,16 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return true; } -/** - * __blk_crypto_evict_key() - Evict a key from a device. - * @profile: the crypto profile of the device - * @key: the key to evict. It must not still be used in any I/O. - * - * If the device has keyslots, this finds the keyslot (if any) that contains the - * specified key and calls the driver's keyslot_evict function to evict it. - * - * Otherwise, this just calls the driver's keyslot_evict function if it is - * implemented, passing just the key (without any particular keyslot). This - * allows layered devices to evict the key from their underlying devices. - * - * Context: Process context. Takes and releases profile->lock. - * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY - * if the keyslot is still in use, or another -errno value on other - * error. +/* + * This is an internal function that evicts a key from an inline encryption + * device that can be either a real device or the blk-crypto-fallback "device". + * It is used only by blk_crypto_evict_key(); see that function for details. */ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key) { struct blk_crypto_keyslot *slot; - int err = 0; + int err; if (profile->num_slots == 0) { if (profile->ll_ops.keyslot_evict) { @@ -389,22 +371,30 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, blk_crypto_hw_enter(profile); slot = blk_crypto_find_keyslot(profile, key); - if (!slot) - goto out_unlock; + if (!slot) { + /* + * Not an error, since a key not in use by I/O is not guaranteed + * to be in a keyslot. There can be more keys than keyslots. + */ + err = 0; + goto out; + } if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + /* BUG: key is still in use by I/O */ err = -EBUSY; - goto out_unlock; + goto out_remove; } err = profile->ll_ops.keyslot_evict(profile, key, blk_crypto_keyslot_index(slot)); - if (err) - goto out_unlock; - +out_remove: + /* + * Callers free the key even on error, so unlink the key from the hash + * table and clear slot->key even on error. + */ hlist_del(&slot->hash_node); slot->key = NULL; - err = 0; -out_unlock: +out: blk_crypto_hw_exit(profile); return err; } diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 45378586151f..4d760b092deb 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/blk-crypto-profile.h> #include <linux/module.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" @@ -224,27 +225,27 @@ static bool bio_crypt_check_alignment(struct bio *bio) return true; } -blk_status_t __blk_crypto_init_request(struct request *rq) +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, rq->crypt_ctx->bc_key, &rq->crypt_keyslot); } -/** - * __blk_crypto_free_request - Uninitialize the crypto fields of a request. - * - * @rq: The request whose crypto fields to uninitialize. - * - * Completely uninitializes the crypto fields of a request. If a keyslot has - * been programmed into some inline encryption hardware, that keyslot is - * released. The rq->crypt_ctx is also freed. - */ -void __blk_crypto_free_request(struct request *rq) +void __blk_crypto_rq_put_keyslot(struct request *rq) { blk_crypto_put_keyslot(rq->crypt_keyslot); + rq->crypt_keyslot = NULL; +} + +void __blk_crypto_free_request(struct request *rq) +{ + /* The keyslot, if one was needed, should have been released earlier. */ + if (WARN_ON_ONCE(rq->crypt_keyslot)) + __blk_crypto_rq_put_keyslot(rq); + mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); - blk_crypto_rq_set_defaults(rq); + rq->crypt_ctx = NULL; } /** @@ -399,30 +400,39 @@ int blk_crypto_start_using_key(struct block_device *bdev, } /** - * blk_crypto_evict_key() - Evict a key from any inline encryption hardware - * it may have been programmed into - * @bdev: The block_device who's associated inline encryption hardware this key - * might have been programmed into - * @key: The key to evict + * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device + * @bdev: a block_device on which I/O using the key may have been done + * @key: the key to evict + * + * For a given block_device, this function removes the given blk_crypto_key from + * the keyslot management structures and evicts it from any underlying hardware + * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. * - * Upper layers (filesystems) must call this function to ensure that a key is - * evicted from any hardware that it might have been programmed into. The key - * must not be in use by any in-flight IO when this function is called. + * Upper layers must call this before freeing the blk_crypto_key. It must be + * called for every block_device the key may have been used on. The key must no + * longer be in use by any I/O when this function is called. * - * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. + * Context: May sleep. */ -int blk_crypto_evict_key(struct block_device *bdev, - const struct blk_crypto_key *key) +void blk_crypto_evict_key(struct block_device *bdev, + const struct blk_crypto_key *key) { struct request_queue *q = bdev_get_queue(bdev); + int err; if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) - return __blk_crypto_evict_key(q->crypto_profile, key); - + err = __blk_crypto_evict_key(q->crypto_profile, key); + else + err = blk_crypto_fallback_evict_key(key); /* - * If the block_device didn't support the key, then blk-crypto-fallback - * may have been used, so try to evict the key from blk-crypto-fallback. + * An error can only occur here if the key failed to be evicted from a + * keyslot (due to a hardware or driver issue) or is allegedly still in + * use by I/O (due to a kernel bug). Even in these cases, the key is + * still unlinked from the keyslot management structures, and the caller + * is allowed and expected to free it right away. There's nothing + * callers can do to handle errors, so just log them and return void. */ - return blk_crypto_fallback_evict_key(key); + if (err) + pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); diff --git a/block/blk-map.c b/block/blk-map.c index 9137d16cecdc..3551c3ff17cf 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -29,10 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); bmd->iter = *data; - if (iter_is_iovec(data)) - bmd->iter.iov = bmd->iov; + if (iter_is_iovec(data)) { + memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs); + bmd->iter.__iov = bmd->iov; + } return bmd; } @@ -280,21 +281,21 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (blk_queue_pci_p2pdma(rq->q)) extraction_flags |= ITER_ALLOW_P2PDMA; + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); while (iov_iter_count(iter)) { - struct page **pages, *stack_pages[UIO_FASTIOV]; + struct page *stack_pages[UIO_FASTIOV]; + struct page **pages = stack_pages; ssize_t bytes; size_t offs; int npages; - if (nr_vecs <= ARRAY_SIZE(stack_pages)) { - pages = stack_pages; - bytes = iov_iter_get_pages(iter, pages, LONG_MAX, - nr_vecs, &offs, extraction_flags); - } else { - bytes = iov_iter_get_pages_alloc(iter, &pages, - LONG_MAX, &offs, extraction_flags); - } + if (nr_vecs > ARRAY_SIZE(stack_pages)) + pages = NULL; + + bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, + nr_vecs, extraction_flags, &offs); if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -316,7 +317,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!bio_add_hw_page(rq->q, bio, page, n, offs, max_sectors, &same_page)) { if (same_page) - put_page(page); + bio_release_page(bio, page); break; } @@ -328,7 +329,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * release the pages we didn't map into the bio, if any */ while (j < npages) - put_page(pages[j++]); + bio_release_page(bio, pages[j++]); if (pages != stack_pages) kvfree(pages); /* couldn't stuff something into bio? */ diff --git a/block/blk-merge.c b/block/blk-merge.c index 6460abdb2426..65e75efa9bd3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -867,6 +867,8 @@ static struct request *attempt_merge(struct request_queue *q, if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); + blk_crypto_rq_put_keyslot(next); + /* * 'next' is going away, so update stats accordingly */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b01818f8e216..212a7f301e73 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -15,33 +15,8 @@ #include "blk-mq-tag.h" #include "blk-rq-qos.h" -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - if (stat->nr_samples) { - seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); - } else { - seq_puts(m, "samples=0"); - } -} - static int queue_poll_stat_show(void *data, struct seq_file *m) { - struct request_queue *q = data; - int bucket; - - if (!q->poll_stat) - return 0; - - for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { - seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); - print_stat(m, &q->poll_stat[2 * bucket]); - seq_puts(m, "\n"); - - seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket)); - print_stat(m, &q->poll_stat[2 * bucket + 1]); - seq_puts(m, "\n"); - } return 0; } @@ -282,7 +257,6 @@ static const char *const rqf_name[] = { RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), - RQF_NAME(MQ_POLL_SLEPT), RQF_NAME(TIMED_OUT), RQF_NAME(ELV), RQF_NAME(RESV), diff --git a/block/blk-mq.c b/block/blk-mq.c index f0ea9dcfb966..3b75de43293c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -46,51 +46,15 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); -static void blk_mq_poll_stats_start(struct request_queue *q); -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); - -static int blk_mq_poll_stats_bkt(const struct request *rq) -{ - int ddir, sectors, bucket; - - ddir = rq_data_dir(rq); - sectors = blk_rq_stats_sectors(rq); - - bucket = ddir + 2 * ilog2(sectors); - - if (bucket < 0) - return -1; - else if (bucket >= BLK_MQ_POLL_STATS_BKTS) - return ddir + BLK_MQ_POLL_STATS_BKTS - 2; - - return bucket; -} - -#define BLK_QC_T_SHIFT 16 -#define BLK_QC_T_INTERNAL (1U << 31) - static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return xa_load(&q->hctx_table, - (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); -} - -static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, - blk_qc_t qc) -{ - unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); - - if (qc & BLK_QC_T_INTERNAL) - return blk_mq_tag_to_rq(hctx->sched_tags, tag); - return blk_mq_tag_to_rq(hctx->tags, tag); + return xa_load(&q->hctx_table, qc); } static inline blk_qc_t blk_rq_to_qc(struct request *rq) { - return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | - (rq->tag != -1 ? - rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); + return rq->mq_hctx->queue_num; } /* @@ -840,6 +804,12 @@ static void blk_complete_request(struct request *req) req->q->integrity.profile->complete_fn(req, total_bytes); #endif + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + blk_crypto_rq_put_keyslot(req); + blk_account_io_completion(req, total_bytes); do { @@ -905,6 +875,13 @@ bool blk_update_request(struct request *req, blk_status_t error, req->q->integrity.profile->complete_fn(req, nr_bytes); #endif + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) + __blk_crypto_rq_put_keyslot(req); + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET)) && !test_bit(GD_DEAD, &req->q->disk->state)) { @@ -976,17 +953,6 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -static void __blk_account_io_done(struct request *req, u64 now) -{ - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); -} - static inline void blk_account_io_done(struct request *req, u64 now) { /* @@ -995,40 +961,41 @@ static inline void blk_account_io_done(struct request *req, u64 now) * containing request is enough. */ if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) - __blk_account_io_done(req, now); -} - -static void __blk_account_io_start(struct request *rq) -{ - /* - * All non-passthrough requests are created from a bio with one - * exception: when a flush command that is part of a flush sequence - * generated by the state machine in blk-flush.c is cloned onto the - * lower device by dm-multipath we can get here without a bio. - */ - if (rq->bio) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->q->disk->part0; + !(req->rq_flags & RQF_FLUSH_SEQ)) { + const int sgrp = op_stat_group(req_op(req)); - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); + } } static inline void blk_account_io_start(struct request *req) { - if (blk_do_io_stat(req)) - __blk_account_io_start(req); + if (blk_do_io_stat(req)) { + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_unlock(); + } } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { - if (rq->rq_flags & RQF_STATS) { - blk_mq_poll_stats_start(rq->q); + if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); - } blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -2966,7 +2933,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); - ret = blk_crypto_init_request(rq); + ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); @@ -3035,8 +3002,9 @@ blk_status_t blk_insert_cloned_request(struct request *rq) if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; + ret = blk_crypto_rq_get_keyslot(rq); + if (ret != BLK_STS_OK) + return ret; blk_account_io_start(rq); @@ -4207,14 +4175,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; - q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, - blk_mq_poll_stats_bkt, - BLK_MQ_POLL_STATS_BKTS, q); - if (!q->poll_cb) - goto err_exit; - if (blk_mq_alloc_ctxs(q)) - goto err_poll; + goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); @@ -4242,11 +4204,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; - /* - * Default to classic polling - */ - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); @@ -4254,9 +4211,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: blk_mq_release(q); -err_poll: - blk_stat_free_callback(q->poll_cb); - q->poll_cb = NULL; err_exit: q->mq_ops = NULL; return -ENOMEM; @@ -4753,138 +4707,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); -/* Enable polling stats and return whether they were already enabled. */ -static bool blk_poll_stats_enable(struct request_queue *q) -{ - if (q->poll_stat) - return true; - - return blk_stats_alloc_enable(q); -} - -static void blk_mq_poll_stats_start(struct request_queue *q) -{ - /* - * We don't arm the callback if polling stats are not enabled or the - * callback is already active. - */ - if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) - return; - - blk_stat_activate_msecs(q->poll_cb, 100); -} - -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) -{ - struct request_queue *q = cb->data; - int bucket; - - for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { - if (cb->stat[bucket].nr_samples) - q->poll_stat[bucket] = cb->stat[bucket]; - } -} - -static unsigned long blk_mq_poll_nsecs(struct request_queue *q, - struct request *rq) -{ - unsigned long ret = 0; - int bucket; - - /* - * If stats collection isn't on, don't sleep but turn it on for - * future users - */ - if (!blk_poll_stats_enable(q)) - return 0; - - /* - * As an optimistic guess, use half of the mean service time - * for this type of request. We can (and should) make this smarter. - * For instance, if the completion latencies are tight, we can - * get closer than just half the mean. This is especially - * important on devices where the completion latencies are longer - * than ~10 usec. We do use the stats for the relevant IO size - * if available which does lead to better estimates. - */ - bucket = blk_mq_poll_stats_bkt(rq); - if (bucket < 0) - return ret; - - if (q->poll_stat[bucket].nr_samples) - ret = (q->poll_stat[bucket].mean + 1) / 2; - - return ret; -} - -static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) -{ - struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); - struct request *rq = blk_qc_to_rq(hctx, qc); - struct hrtimer_sleeper hs; - enum hrtimer_mode mode; - unsigned int nsecs; - ktime_t kt; - - /* - * If a request has completed on queue that uses an I/O scheduler, we - * won't get back a request from blk_qc_to_rq. - */ - if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) - return false; - - /* - * If we get here, hybrid polling is enabled. Hence poll_nsec can be: - * - * 0: use half of prev avg - * >0: use this specific value - */ - if (q->poll_nsec > 0) - nsecs = q->poll_nsec; - else - nsecs = blk_mq_poll_nsecs(q, rq); - - if (!nsecs) - return false; - - rq->rq_flags |= RQF_MQ_POLL_SLEPT; - - /* - * This will be replaced with the stats tracking code, using - * 'avg_completion_time / 2' as the pre-sleep target. - */ - kt = nsecs; - - mode = HRTIMER_MODE_REL; - hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); - hrtimer_set_expires(&hs.timer, kt); - - do { - if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) - break; - set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_sleeper_start_expires(&hs, mode); - if (hs.task) - io_schedule(); - hrtimer_cancel(&hs.timer); - mode = HRTIMER_MODE_ABS; - } while (hs.task && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - destroy_hrtimer_on_stack(&hs.timer); - - /* - * If we sleep, have the caller restart the poll loop to reset the - * state. Like for the other success return cases, the caller is - * responsible for checking if the IO completed. If the IO isn't - * complete, we'll get called again and will go straight to the busy - * poll loop. - */ - return true; -} - -static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, - struct io_comp_batch *iob, unsigned int flags) +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) { struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); @@ -4911,17 +4735,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, return 0; } -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, - unsigned int flags) -{ - if (!(flags & BLK_POLL_NOSLEEP) && - q->poll_nsec != BLK_MQ_POLL_CLASSIC) { - if (blk_mq_poll_hybrid(q, cookie)) - return 1; - } - return blk_mq_poll_classic(q, cookie, iob, flags); -} - unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; diff --git a/block/blk-stat.c b/block/blk-stat.c index c6ca16abf911..74a1a8c32d86 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats) kfree(stats); } - -bool blk_stats_alloc_enable(struct request_queue *q) -{ - struct blk_rq_stat *poll_stat; - - poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat), - GFP_ATOMIC); - if (!poll_stat) - return false; - - if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) { - kfree(poll_stat); - return true; - } - - blk_stat_add_callback(q, q->poll_cb); - return false; -} diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f1fce1c7fa44..1a743b4f2958 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -408,35 +408,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { - int val; - - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - val = BLK_MQ_POLL_CLASSIC; - else - val = q->poll_nsec / 1000; - - return sprintf(page, "%d\n", val); + return sprintf(page, "%d\n", -1); } static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, size_t count) { - int err, val; - - if (!q->mq_ops || !q->mq_ops->poll) - return -EINVAL; - - err = kstrtoint(page, 10, &val); - if (err < 0) - return err; - - if (val == BLK_MQ_POLL_CLASSIC) - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - else if (val >= 0) - q->poll_nsec = val * 1000; - else - return -EINVAL; - return count; } diff --git a/block/blk.h b/block/blk.h index cc4e8873dfde..d65d96994a94 100644 --- a/block/blk.h +++ b/block/blk.h @@ -432,6 +432,18 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +/* + * Clean up a page appropriately, where the page may be pinned, may have a + * ref taken on it or neither. + */ +static inline void bio_release_page(struct bio *bio, struct page *page) +{ + if (bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_page(page); + else if (bio_flagged(bio, BIO_PAGE_REFFED)) + put_page(page); +} + struct request_queue *blk_alloc_queue(int node_id); int disk_scan_partitions(struct gendisk *disk, fmode_t mode); diff --git a/block/opal_proto.h b/block/opal_proto.h index 7152aa1f1a49..a4e56845dd82 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -86,6 +86,15 @@ enum opal_response_token { #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 +/* + * Boolean operators from TCG Core spec 2.01 Section: + * 5.1.3.11 + * Table 61 + */ +#define OPAL_BOOLEAN_AND 0 +#define OPAL_BOOLEAN_OR 1 +#define OPAL_BOOLEAN_NOT 2 + /* Enum to index OPALUID array */ enum opal_uid { /* users */ @@ -105,6 +114,7 @@ enum opal_uid { /* tables */ OPAL_TABLE_TABLE, OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_START_TO_KEY, OPAL_LOCKINGRANGE_ACE_RDLOCKED, OPAL_LOCKINGRANGE_ACE_WRLOCKED, OPAL_MBRCONTROL, diff --git a/block/sed-opal.c b/block/sed-opal.c index c320093c14f1..3fc4e65db111 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -132,6 +132,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_START_TO_KEY] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xD0, 0x01 }, [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = @@ -1147,12 +1149,8 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont) return opal_send_recv(dev, cont); } -/* - * request @column from table @table on device @dev. On success, the column - * data will be available in dev->resp->tok[4] - */ -static int generic_get_column(struct opal_dev *dev, const u8 *table, - u64 column) +static int generic_get_columns(struct opal_dev *dev, const u8 *table, + u64 start_column, u64 end_column) { int err; @@ -1162,12 +1160,12 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, OPAL_STARTCOLUMN); - add_token_u64(&err, dev, column); + add_token_u64(&err, dev, start_column); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, OPAL_ENDCOLUMN); - add_token_u64(&err, dev, column); + add_token_u64(&err, dev, end_column); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_ENDLIST); @@ -1179,6 +1177,16 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, } /* + * request @column from table @table on device @dev. On success, the column + * data will be available in dev->resp->tok[4] + */ +static int generic_get_column(struct opal_dev *dev, const u8 *table, + u64 column) +{ + return generic_get_columns(dev, table, column, column); +} + +/* * see TCG SAS 5.3.2.3 for a description of the available columns * * the result is provided in dev->resp->tok[4] @@ -1437,6 +1445,129 @@ static int setup_locking_range(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int response_get_column(const struct parsed_resp *resp, + int *iter, + u8 column, + u64 *value) +{ + const struct opal_resp_tok *tok; + int n = *iter; + u64 val; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", n); + return OPAL_INVAL_PARAM; + } + n++; + + if (response_get_u64(resp, n) != column) { + pr_debug("Token %d does not match expected column %u.\n", + n, column); + return OPAL_INVAL_PARAM; + } + n++; + + val = response_get_u64(resp, n); + n++; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", n); + return OPAL_INVAL_PARAM; + } + n++; + + *value = val; + *iter = n; + + return 0; +} + +static int locking_range_status(struct opal_dev *dev, void *data) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u64 resp; + bool rlocked, wlocked; + int err, tok_n = 2; + struct opal_lr_status *lrst = data; + + err = build_locking_range(lr_buffer, sizeof(lr_buffer), + lrst->session.opal_key.lr); + if (err) + return err; + + err = generic_get_columns(dev, lr_buffer, OPAL_RANGESTART, + OPAL_WRITELOCKED); + if (err) { + pr_debug("Couldn't get lr %u table columns %d to %d.\n", + lrst->session.opal_key.lr, OPAL_RANGESTART, + OPAL_WRITELOCKED); + return err; + } + + /* range start */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGESTART, + &lrst->range_start); + if (err) + return err; + + /* range length */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGELENGTH, + &lrst->range_length); + if (err) + return err; + + /* RLE */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKENABLED, + &resp); + if (err) + return err; + + lrst->RLE = !!resp; + + /* WLE */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKENABLED, + &resp); + if (err) + return err; + + lrst->WLE = !!resp; + + /* read locked */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKED, &resp); + if (err) + return err; + + rlocked = !!resp; + + /* write locked */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKED, &resp); + if (err) + return err; + + wlocked = !!resp; + + /* opal_lock_state can not map 'read locked' only state. */ + lrst->l_state = OPAL_RW; + if (rlocked && wlocked) + lrst->l_state = OPAL_LK; + else if (wlocked) + lrst->l_state = OPAL_RO; + else if (rlocked) { + pr_debug("Can not report read locked only state.\n"); + return -EINVAL; + } + + return 0; +} + static int start_generic_opal_session(struct opal_dev *dev, enum opal_uid auth, enum opal_uid sp_type, @@ -1759,25 +1890,43 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } -static int add_user_to_lr(struct opal_dev *dev, void *data) +static void add_authority_object_ref(int *err, + struct opal_dev *dev, + const u8 *uid, + size_t uid_len) +{ + add_token_u8(err, dev, OPAL_STARTNAME); + add_token_bytestring(err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(err, dev, uid, uid_len); + add_token_u8(err, dev, OPAL_ENDNAME); +} + +static void add_boolean_object_ref(int *err, + struct opal_dev *dev, + u8 boolean_op) +{ + add_token_u8(err, dev, OPAL_STARTNAME); + add_token_bytestring(err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(err, dev, boolean_op); + add_token_u8(err, dev, OPAL_ENDNAME); +} + +static int set_lr_boolean_ace(struct opal_dev *dev, + unsigned int opal_uid, + u8 lr, + const u8 *users, + size_t users_len) { u8 lr_buffer[OPAL_UID_LENGTH]; u8 user_uid[OPAL_UID_LENGTH]; - struct opal_lock_unlock *lkul = data; + u8 u; int err; - memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], - OPAL_UID_LENGTH); - - if (lkul->l_state == OPAL_RW) - memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], - OPAL_UID_LENGTH); - - lr_buffer[7] = lkul->session.opal_key.lr; - - memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); - - user_uid[7] = lkul->session.who; + memcpy(lr_buffer, opaluid[opal_uid], OPAL_UID_LENGTH); + lr_buffer[7] = lr; err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]); @@ -1790,35 +1939,49 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) add_token_u8(&err, dev, OPAL_STARTLIST); + for (u = 0; u < users_len; u++) { + if (users[u] == OPAL_ADMIN1) + memcpy(user_uid, opaluid[OPAL_ADMIN1_UID], + OPAL_UID_LENGTH); + else { + memcpy(user_uid, opaluid[OPAL_USER1_UID], + OPAL_UID_LENGTH); + user_uid[7] = users[u]; + } - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, - opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], - OPAL_UID_LENGTH/2); - add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); - add_token_u8(&err, dev, OPAL_ENDNAME); - - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, - opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], - OPAL_UID_LENGTH/2); - add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); - add_token_u8(&err, dev, OPAL_ENDNAME); - - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], - OPAL_UID_LENGTH/2); - add_token_u8(&err, dev, 1); - add_token_u8(&err, dev, OPAL_ENDNAME); + add_authority_object_ref(&err, dev, user_uid, sizeof(user_uid)); + /* + * Add boolean operator in postfix only with + * two or more authorities being added in ACE + * expresion. + * */ + if (u > 0) + add_boolean_object_ref(&err, dev, OPAL_BOOLEAN_OR); + } add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); + return err; +} + +static int add_user_to_lr(struct opal_dev *dev, void *data) +{ + int err; + struct opal_lock_unlock *lkul = data; + const u8 users[] = { + lkul->session.who + }; + + err = set_lr_boolean_ace(dev, + lkul->l_state == OPAL_RW ? + OPAL_LOCKINGRANGE_ACE_WRLOCKED : + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + lkul->session.opal_key.lr, users, + ARRAY_SIZE(users)); if (err) { pr_debug("Error building add user to locking range command.\n"); return err; @@ -1827,6 +1990,27 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int add_user_to_lr_ace(struct opal_dev *dev, void *data) +{ + int err; + struct opal_lock_unlock *lkul = data; + const u8 users[] = { + OPAL_ADMIN1, + lkul->session.who + }; + + err = set_lr_boolean_ace(dev, OPAL_LOCKINGRANGE_ACE_START_TO_KEY, + lkul->session.opal_key.lr, users, + ARRAY_SIZE(users)); + + if (err) { + pr_debug("Error building add user to locking ranges ACEs.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int lock_unlock_locking_range(struct opal_dev *dev, void *data) { u8 lr_buffer[OPAL_UID_LENGTH]; @@ -2364,6 +2548,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev, const struct opal_step steps[] = { { start_admin1LSP_opal_session, &lk_unlk->session.opal_key }, { add_user_to_lr, lk_unlk }, + { add_user_to_lr_ace, lk_unlk }, { end_opal_session, } }; int ret; @@ -2580,6 +2765,33 @@ static int opal_setup_locking_range(struct opal_dev *dev, return ret; } +static int opal_locking_range_status(struct opal_dev *dev, + struct opal_lr_status *opal_lrst, + void __user *data) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrst->session }, + { locking_range_status, opal_lrst }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_lr_status, range_start), + (void *)opal_lrst + offsetof(struct opal_lr_status, range_start), + sizeof(*opal_lrst) - offsetof(struct opal_lr_status, range_start))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + + return ret; +} + static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) { const struct opal_step pw_steps[] = { @@ -2814,6 +3026,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_STATUS: ret = opal_get_status(dev, arg); break; + case IOC_OPAL_GET_LR_STATUS: + ret = opal_locking_range_status(dev, p, arg); + break; default: break; } diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 429255876800..64b3a1c76f03 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -735,8 +735,9 @@ static bool update_rs_extent(struct drbd_device *device, return false; } -void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) +void drbd_advance_rs_marks(struct drbd_peer_device *peer_device, unsigned long still_to_go) { + struct drbd_device *device = peer_device->device; unsigned long now = jiffies; unsigned long last = device->rs_mark_time[device->rs_last_mark]; int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; @@ -819,7 +820,7 @@ static int update_sync_bits(struct drbd_device *device, if (mode == SET_IN_SYNC) { unsigned long still_to_go = drbd_bm_total_weight(device); bool rs_is_done = (still_to_go <= device->rs_failed); - drbd_advance_rs_marks(device, still_to_go); + drbd_advance_rs_marks(first_peer_device(device), still_to_go); if (cleared || rs_is_done) maybe_schedule_on_disk_bitmap_update(device, rs_is_done); } else if (mode == RECORD_RS_FAILED) @@ -843,10 +844,11 @@ static bool plausible_request_size(int size) * called by worker on C_SYNC_TARGET and receiver on SyncSource. * */ -int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, +int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector, int size, enum update_sync_bits_mode mode) { /* Is called from worker and receiver context _only_ */ + struct drbd_device *device = peer_device->device; unsigned long sbnr, ebnr, lbnr; unsigned long count = 0; sector_t esector, nr_sectors; @@ -1009,14 +1011,15 @@ retry: * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN * if there is still application IO going on in this area. */ -int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) +int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector) { + struct drbd_device *device = peer_device->device; unsigned int enr = BM_SECT_TO_EXT(sector); const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; struct lc_element *e; struct bm_extent *bm_ext; int i; - bool throttle = drbd_rs_should_slow_down(device, sector, true); + bool throttle = drbd_rs_should_slow_down(peer_device, sector, true); /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, * not yet BME_LOCKED) extent needs to be kicked out explicitly if we diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 289876ffbc31..6ac8c54b44c7 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1216,7 +1216,9 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned * drbd_bm_read() - Read the whole bitmap from its on disk location. * @device: DRBD device. */ -int drbd_bm_read(struct drbd_device *device) __must_hold(local) +int drbd_bm_read(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) + { return bm_rw(device, BM_AIO_READ, 0); } @@ -1227,7 +1229,8 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local) * * Will only write pages that have changed since last IO. */ -int drbd_bm_write(struct drbd_device *device) __must_hold(local) +int drbd_bm_write(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) { return bm_rw(device, 0, 0); } @@ -1238,7 +1241,8 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local) * * Will write all pages. */ -int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) +int drbd_bm_write_all(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) { return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0); } @@ -1264,7 +1268,8 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho * verify is aborted due to a failed peer disk, while local IO continues, or * pending resync acks are still being processed. */ -int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) +int drbd_bm_write_copy_pages(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) { return bm_rw(device, BM_AIO_COPY_PAGES, 0); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d89b7d03d4c8..a30a5ed811be 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -66,6 +66,7 @@ extern int drbd_proc_details; struct drbd_device; struct drbd_connection; +struct drbd_peer_device; /* Defines to control fault insertion */ enum { @@ -126,8 +127,8 @@ struct bm_xfer_ctx { unsigned bytes[2]; }; -extern void INFO_bm_xfer_stats(struct drbd_device *device, - const char *direction, struct bm_xfer_ctx *c); +extern void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device, + const char *direction, struct bm_xfer_ctx *c); static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) { @@ -541,9 +542,10 @@ struct drbd_md_io { struct bm_io_work { struct drbd_work w; + struct drbd_peer_device *peer_device; char *why; enum bm_flag flags; - int (*io_fn)(struct drbd_device *device); + int (*io_fn)(struct drbd_device *device, struct drbd_peer_device *peer_device); void (*done)(struct drbd_device *device, int rv); }; @@ -1041,7 +1043,7 @@ extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector, enum drbd_packet cmd); extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size); -extern int drbd_send_bitmap(struct drbd_device *device); +extern int drbd_send_bitmap(struct drbd_device *device, struct drbd_peer_device *peer_device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *); @@ -1065,17 +1067,22 @@ extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold extern int drbd_md_test_flag(struct drbd_backing_dev *, int); extern void drbd_md_mark_dirty(struct drbd_device *device); extern void drbd_queue_bitmap_io(struct drbd_device *device, - int (*io_fn)(struct drbd_device *), + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), void (*done)(struct drbd_device *, int), - char *why, enum bm_flag flags); + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device); extern int drbd_bitmap_io(struct drbd_device *device, - int (*io_fn)(struct drbd_device *), - char *why, enum bm_flag flags); + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device); extern int drbd_bitmap_io_from_worker(struct drbd_device *device, - int (*io_fn)(struct drbd_device *), - char *why, enum bm_flag flags); -extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local); -extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local); + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device); +extern int drbd_bmio_set_n_write(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local); +extern int drbd_bmio_clear_n_write(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local); /* Meta data layout * @@ -1284,14 +1291,18 @@ extern void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e); extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); -extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_read(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local); extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); -extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local); extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); -extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); -extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local); extern size_t drbd_bm_words(struct drbd_device *device); extern unsigned long drbd_bm_bits(struct drbd_device *device); extern sector_t drbd_bm_capacity(struct drbd_device *device); @@ -1422,21 +1433,24 @@ void drbd_resync_after_changed(struct drbd_device *device); extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side); extern void resume_next_sg(struct drbd_device *device); extern void suspend_other_sg(struct drbd_device *device); -extern int drbd_resync_finished(struct drbd_device *device); +extern int drbd_resync_finished(struct drbd_peer_device *peer_device); /* maybe rather drbd_main.c ? */ extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); extern void drbd_md_put_buffer(struct drbd_device *device); extern int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, sector_t sector, enum req_op op); -extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); +extern void drbd_ov_out_of_sync_found(struct drbd_peer_device *peer_device, + sector_t sector, int size); extern void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, unsigned int *done); -extern void drbd_rs_controller_reset(struct drbd_device *device); +extern void drbd_rs_controller_reset(struct drbd_peer_device *peer_device); -static inline void ov_out_of_sync_print(struct drbd_device *device) +static inline void ov_out_of_sync_print(struct drbd_peer_device *peer_device) { + struct drbd_device *device = peer_device->device; + if (device->ov_last_oos_size) { - drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n", + drbd_err(peer_device, "Out of sync: start=%llu, size=%lu (sectors)\n", (unsigned long long)device->ov_last_oos_start, (unsigned long)device->ov_last_oos_size); } @@ -1475,7 +1489,7 @@ extern int drbd_ack_receiver(struct drbd_thread *thi); extern void drbd_send_ping_wf(struct work_struct *ws); extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); -extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, +extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, bool throttle_if_app_is_waiting); extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); @@ -1531,22 +1545,22 @@ extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); -extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector); +extern int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector); extern void drbd_rs_cancel_all(struct drbd_device *device); extern int drbd_rs_del_all(struct drbd_device *device); -extern void drbd_rs_failed_io(struct drbd_device *device, +extern void drbd_rs_failed_io(struct drbd_peer_device *peer_device, sector_t sector, int size); -extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); +extern void drbd_advance_rs_marks(struct drbd_peer_device *peer_device, unsigned long still_to_go); enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; -extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, +extern int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector, int size, enum update_sync_bits_mode mode); -#define drbd_set_in_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_IN_SYNC) -#define drbd_set_out_of_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) -#define drbd_rs_failed_io(device, sector, size) \ - __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) +#define drbd_set_in_sync(peer_device, sector, size) \ + __drbd_change_sync(peer_device, sector, size, SET_IN_SYNC) +#define drbd_set_out_of_sync(peer_device, sector, size) \ + __drbd_change_sync(peer_device, sector, size, SET_OUT_OF_SYNC) +#define drbd_rs_failed_io(peer_device, sector, size) \ + __drbd_change_sync(peer_device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); extern int drbd_al_initialize(struct drbd_device *, void *); @@ -1918,18 +1932,14 @@ static inline void inc_ap_pending(struct drbd_device *device) atomic_inc(&device->ap_pending_cnt); } -#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ - if (atomic_read(&device->which) < 0) \ - drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \ - func, line, \ - atomic_read(&device->which)) - -#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__) -static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) +#define dec_ap_pending(device) ((void)expect((device), __dec_ap_pending(device) >= 0)) +static inline int __dec_ap_pending(struct drbd_device *device) { - if (atomic_dec_and_test(&device->ap_pending_cnt)) + int ap_pending_cnt = atomic_dec_return(&device->ap_pending_cnt); + + if (ap_pending_cnt == 0) wake_up(&device->misc_wait); - ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); + return ap_pending_cnt; } /* counts how many resync-related answers we still expect from the peer @@ -1938,16 +1948,16 @@ static inline void _dec_ap_pending(struct drbd_device *device, const char *func, * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) * (or P_NEG_ACK with ID_SYNCER) */ -static inline void inc_rs_pending(struct drbd_device *device) +static inline void inc_rs_pending(struct drbd_peer_device *peer_device) { - atomic_inc(&device->rs_pending_cnt); + atomic_inc(&peer_device->device->rs_pending_cnt); } -#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__) -static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) +#define dec_rs_pending(peer_device) \ + ((void)expect((peer_device), __dec_rs_pending(peer_device) >= 0)) +static inline int __dec_rs_pending(struct drbd_peer_device *peer_device) { - atomic_dec(&device->rs_pending_cnt); - ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); + return atomic_dec_return(&peer_device->device->rs_pending_cnt); } /* counts how many answers we still need to send to the peer. @@ -1964,18 +1974,16 @@ static inline void inc_unacked(struct drbd_device *device) atomic_inc(&device->unacked_cnt); } -#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__) -static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) +#define dec_unacked(device) ((void)expect(device, __dec_unacked(device) >= 0)) +static inline int __dec_unacked(struct drbd_device *device) { - atomic_dec(&device->unacked_cnt); - ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); + return atomic_dec_return(&device->unacked_cnt); } -#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__) -static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) +#define sub_unacked(device, n) ((void)expect(device, __sub_unacked(device) >= 0)) +static inline int __sub_unacked(struct drbd_device *device, int n) { - atomic_sub(n, &device->unacked_cnt); - ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); + return atomic_sub_return(n, &device->unacked_cnt); } static inline bool is_sync_target_state(enum drbd_conns connection_state) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2c764f7ee4a7..83987e7a5ef2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -231,9 +231,11 @@ void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, } req = list_prepare_entry(tmp, &connection->transfer_log, tl_requests); list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) { + struct drbd_peer_device *peer_device; if (req->epoch != expect_epoch) break; - _req_mod(req, BARRIER_ACKED); + peer_device = conn_peer_device(connection, req->device->vnr); + _req_mod(req, BARRIER_ACKED, peer_device); } spin_unlock_irq(&connection->resource->req_lock); @@ -256,10 +258,13 @@ bail: /* must hold resource->req_lock */ void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what) { + struct drbd_peer_device *peer_device; struct drbd_request *req, *r; - list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) - _req_mod(req, what); + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) { + peer_device = conn_peer_device(connection, req->device->vnr); + _req_mod(req, what, peer_device); + } } void tl_restart(struct drbd_connection *connection, enum drbd_req_event what) @@ -297,7 +302,7 @@ void tl_abort_disk_io(struct drbd_device *device) continue; if (req->device != device) continue; - _req_mod(req, ABORT_DISK_IO); + _req_mod(req, ABORT_DISK_IO, NULL); } spin_unlock_irq(&connection->resource->req_lock); } @@ -1198,10 +1203,11 @@ static int fill_bitmap_rle_bits(struct drbd_device *device, * code upon failure. */ static int -send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) +send_bitmap_rle_or_plain(struct drbd_peer_device *peer_device, struct bm_xfer_ctx *c) { - struct drbd_socket *sock = &first_peer_device(device)->connection->data; - unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock = &peer_device->connection->data; + unsigned int header_size = drbd_header_size(peer_device->connection); struct p_compressed_bm *p = sock->sbuf + header_size; int len, err; @@ -1212,7 +1218,7 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) if (len) { dcbp_set_code(p, RLE_VLI_Bits); - err = __send_command(first_peer_device(device)->connection, device->vnr, sock, + err = __send_command(peer_device->connection, device->vnr, sock, P_COMPRESSED_BITMAP, sizeof(*p) + len, NULL, 0); c->packets[0]++; @@ -1233,7 +1239,8 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) len = num_words * sizeof(*p); if (len) drbd_bm_get_lel(device, c->word_offset, num_words, p); - err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0); + err = __send_command(peer_device->connection, device->vnr, sock, P_BITMAP, + len, NULL, 0); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; @@ -1245,7 +1252,7 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) } if (!err) { if (len == 0) { - INFO_bm_xfer_stats(device, "send", c); + INFO_bm_xfer_stats(peer_device, "send", c); return 0; } else return 1; @@ -1254,7 +1261,8 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) } /* See the comment at receive_bitmap() */ -static int _drbd_send_bitmap(struct drbd_device *device) +static int _drbd_send_bitmap(struct drbd_device *device, + struct drbd_peer_device *peer_device) { struct bm_xfer_ctx c; int err; @@ -1266,7 +1274,7 @@ static int _drbd_send_bitmap(struct drbd_device *device) if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) { drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n"); drbd_bm_set_all(device); - if (drbd_bm_write(device)) { + if (drbd_bm_write(device, peer_device)) { /* write_bm did fail! Leave full sync flag set in Meta P_DATA * but otherwise process as per normal - need to tell other * side that a full resync is required! */ @@ -1285,20 +1293,20 @@ static int _drbd_send_bitmap(struct drbd_device *device) }; do { - err = send_bitmap_rle_or_plain(device, &c); + err = send_bitmap_rle_or_plain(peer_device, &c); } while (err > 0); return err == 0; } -int drbd_send_bitmap(struct drbd_device *device) +int drbd_send_bitmap(struct drbd_device *device, struct drbd_peer_device *peer_device) { - struct drbd_socket *sock = &first_peer_device(device)->connection->data; + struct drbd_socket *sock = &peer_device->connection->data; int err = -1; mutex_lock(&sock->mutex); if (sock->socket) - err = !_drbd_send_bitmap(device); + err = !_drbd_send_bitmap(device, peer_device); mutex_unlock(&sock->mutex); return err; } @@ -3406,7 +3414,9 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) * * Sets all bits in the bitmap and writes the whole bitmap to stable storage. */ -int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) +int drbd_bmio_set_n_write(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) + { int rv = -EIO; @@ -3414,7 +3424,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) drbd_md_sync(device); drbd_bm_set_all(device); - rv = drbd_bm_write(device); + rv = drbd_bm_write(device, peer_device); if (!rv) { drbd_md_clear_flag(device, MDF_FULL_SYNC); @@ -3430,11 +3440,13 @@ int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) * * Clears all bits in the bitmap and writes the whole bitmap to stable storage. */ -int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local) +int drbd_bmio_clear_n_write(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) + { drbd_resume_al(device); drbd_bm_clear_all(device); - return drbd_bm_write(device); + return drbd_bm_write(device, peer_device); } static int w_bitmap_io(struct drbd_work *w, int unused) @@ -3453,7 +3465,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) if (get_ldev(device)) { drbd_bm_lock(device, work->why, work->flags); - rv = work->io_fn(device); + rv = work->io_fn(device, work->peer_device); drbd_bm_unlock(device); put_ldev(device); } @@ -3488,11 +3500,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused) * put_ldev(). */ void drbd_queue_bitmap_io(struct drbd_device *device, - int (*io_fn)(struct drbd_device *), + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), void (*done)(struct drbd_device *, int), - char *why, enum bm_flag flags) + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device) { - D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + D_ASSERT(device, current == peer_device->connection->worker.task); D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags)); D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags)); @@ -3501,6 +3514,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n", why, device->bm_io_work.why); + device->bm_io_work.peer_device = peer_device; device->bm_io_work.io_fn = io_fn; device->bm_io_work.done = done; device->bm_io_work.why = why; @@ -3512,7 +3526,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * application IO does not conflict anyways. */ if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + drbd_queue_work(&peer_device->connection->sender_work, &device->bm_io_work.w); } spin_unlock_irq(&device->resource->req_lock); @@ -3528,8 +3542,10 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. */ -int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), - char *why, enum bm_flag flags) +int drbd_bitmap_io(struct drbd_device *device, + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device) { /* Only suspend io, if some operation is supposed to be locked out */ const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST); @@ -3541,7 +3557,7 @@ int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device * drbd_suspend_io(device); drbd_bm_lock(device, why, flags); - rv = io_fn(device); + rv = io_fn(device, peer_device); drbd_bm_unlock(device); if (do_suspend_io) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 60757ac31701..8967298968f3 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1053,7 +1053,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, - "size changed", BM_LOCKED_MASK); + "size changed", BM_LOCKED_MASK, NULL); /* on-disk bitmap and activity log is authoritative again * (unless there was an IO error meanwhile...) */ @@ -2027,13 +2027,15 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_info(device, "Assuming that all blocks are out of sync " "(aka FullSync)\n"); if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, - "set_n_write from attaching", BM_LOCKED_MASK)) { + "set_n_write from attaching", BM_LOCKED_MASK, + NULL)) { retcode = ERR_IO_MD_DISK; goto force_diskless_dec; } } else { if (drbd_bitmap_io(device, &drbd_bm_read, - "read from attaching", BM_LOCKED_MASK)) { + "read from attaching", BM_LOCKED_MASK, + NULL)) { retcode = ERR_IO_MD_DISK; goto force_diskless_dec; } @@ -2972,7 +2974,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT)); if (retcode >= SS_SUCCESS) { if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, - "set_n_write from invalidate", BM_LOCKED_MASK)) + "set_n_write from invalidate", BM_LOCKED_MASK, NULL)) retcode = ERR_IO_MD_DISK; } } else @@ -3005,11 +3007,12 @@ out: return 0; } -static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local) +static int drbd_bmio_set_susp_al(struct drbd_device *device, + struct drbd_peer_device *peer_device) __must_hold(local) { int rv; - rv = drbd_bmio_set_n_write(device); + rv = drbd_bmio_set_n_write(device, peer_device); drbd_suspend_al(device); return rv; } @@ -3052,7 +3055,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) if (retcode >= SS_SUCCESS) { if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al, "set_n_write from invalidate_peer", - BM_LOCKED_SET_ALLOWED)) + BM_LOCKED_SET_ALLOWED, NULL)) retcode = ERR_IO_MD_DISK; } } else @@ -4148,7 +4151,7 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) if (args.clear_bm) { err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write, - "clear_n_write from new_c_uuid", BM_LOCKED_MASK); + "clear_n_write from new_c_uuid", BM_LOCKED_MASK, NULL); if (err) { drbd_err(device, "Writing bitmap failed with %d\n", err); retcode = ERR_IO_MD_DISK; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 757f4692b5bd..e54404c632e7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2044,11 +2044,11 @@ static int e_end_resync_block(struct drbd_work *w, int unused) D_ASSERT(device, drbd_interval_empty(&peer_req->i)); if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { - drbd_set_in_sync(device, sector, peer_req->i.size); + drbd_set_in_sync(peer_device, sector, peer_req->i.size); err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); } else { /* Record failure to sync */ - drbd_rs_failed_io(device, sector, peer_req->i.size); + drbd_rs_failed_io(peer_device, sector, peer_req->i.size); err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); } @@ -2067,7 +2067,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto if (!peer_req) goto fail; - dec_rs_pending(device); + dec_rs_pending(peer_device); inc_unacked(device); /* corresponding dec_unacked() in e_end_resync_block() @@ -2138,7 +2138,7 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i err = recv_dless_read(peer_device, req, sector, pi->size); if (!err) - req_mod(req, DATA_RECEIVED); + req_mod(req, DATA_RECEIVED, peer_device); /* else: nothing. handled from drbd_disconnect... * I don't think we may complete this just yet * in case we are "on-disconnect: freeze" */ @@ -2196,7 +2196,7 @@ static void restart_conflicting_writes(struct drbd_device *device, continue; /* as it is RQ_POSTPONED, this will cause it to * be queued on the retry workqueue. */ - __req_mod(req, CONFLICT_RESOLVED, NULL); + __req_mod(req, CONFLICT_RESOLVED, NULL, NULL); } } @@ -2220,7 +2220,7 @@ static int e_end_block(struct drbd_work *w, int cancel) P_RS_WRITE_ACK : P_WRITE_ACK; err = drbd_send_ack(peer_device, pcmd, peer_req); if (pcmd == P_RS_WRITE_ACK) - drbd_set_in_sync(device, sector, peer_req->i.size); + drbd_set_in_sync(peer_device, sector, peer_req->i.size); } else { err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); /* we expect it to be marked out of sync anyways... @@ -2420,6 +2420,7 @@ static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf) static void fail_postponed_requests(struct drbd_device *device, sector_t sector, unsigned int size) { + struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_interval *i; repeat: @@ -2433,7 +2434,7 @@ static void fail_postponed_requests(struct drbd_device *device, sector_t sector, if (!(req->rq_state & RQ_POSTPONED)) continue; req->rq_state &= ~RQ_POSTPONED; - __req_mod(req, NEG_ACKED, &m); + __req_mod(req, NEG_ACKED, peer_device, &m); spin_unlock_irq(&device->resource->req_lock); if (m.bio) complete_master_bio(device, &m); @@ -2690,7 +2691,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * if (device->state.pdsk < D_INCONSISTENT) { /* In case we have the only disk of the cluster, */ - drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + drbd_set_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size); peer_req->flags &= ~EE_MAY_SET_IN_SYNC; drbd_al_begin_io(device, &peer_req->i); peer_req->flags |= EE_CALL_AL_COMPLETE_IO; @@ -2729,9 +2730,10 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, +bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, bool throttle_if_app_is_waiting) { + struct drbd_device *device = peer_device->device; struct lc_element *tmp; bool throttle = drbd_rs_c_min_rate_throttle(device); @@ -2843,7 +2845,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet break; case P_OV_REPLY: verb = 0; - dec_rs_pending(device); + dec_rs_pending(peer_device); drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); break; default: @@ -2914,7 +2916,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet /* track progress, we may need to throttle */ atomic_add(size >> 9, &device->rs_sect_in); peer_req->w.cb = w_e_end_ov_reply; - dec_rs_pending(device); + dec_rs_pending(peer_device); /* drbd_rs_begin_io done when we sent this request, * but accounting still needs to be done. */ goto submit_for_resync; @@ -2977,7 +2979,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet update_receiver_timing_details(connection, drbd_rs_should_slow_down); if (device->state.peer != R_PRIMARY - && drbd_rs_should_slow_down(device, sector, false)) + && drbd_rs_should_slow_down(peer_device, sector, false)) schedule_timeout_uninterruptible(HZ/10); update_receiver_timing_details(connection, drbd_rs_begin_io); if (drbd_rs_begin_io(device, sector)) @@ -3226,10 +3228,11 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, -1096 requires proto 96 */ -static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local) +static int drbd_uuid_compare(struct drbd_peer_device *const peer_device, + enum drbd_role const peer_role, int *rule_nr) __must_hold(local) { - struct drbd_peer_device *const peer_device = first_peer_device(device); - struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + struct drbd_connection *const connection = peer_device->connection; + struct drbd_device *device = peer_device->device; u64 self, peer; int i, j; @@ -3465,7 +3468,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); - hg = drbd_uuid_compare(device, peer_role, &rule_nr); + hg = drbd_uuid_compare(peer_device, peer_role, &rule_nr); spin_unlock_irq(&device->ldev->md.uuid_lock); drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); @@ -3591,7 +3594,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, if (abs(hg) >= 2) { drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", - BM_LOCKED_SET_ALLOWED)) + BM_LOCKED_SET_ALLOWED, NULL)) return C_MASK; } @@ -4270,7 +4273,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n"); drbd_bitmap_io(device, &drbd_bmio_clear_n_write, "clear_n_write from receive_uuids", - BM_LOCKED_TEST_ALLOWED); + BM_LOCKED_TEST_ALLOWED, NULL); _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]); _drbd_uuid_set(device, UI_BITMAP, 0); _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), @@ -4448,7 +4451,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info else if (os.conn >= C_SYNC_SOURCE && peer_state.conn == C_CONNECTED) { if (drbd_bm_total_weight(device) <= device->rs_failed) - drbd_resync_finished(device); + drbd_resync_finished(peer_device); return 0; } } @@ -4456,8 +4459,8 @@ static int receive_state(struct drbd_connection *connection, struct packet_info /* explicit verify finished notification, stop sector reached. */ if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { - ov_out_of_sync_print(device); - drbd_resync_finished(device); + ov_out_of_sync_print(peer_device); + drbd_resync_finished(peer_device); return 0; } @@ -4766,11 +4769,11 @@ decode_bitmap_c(struct drbd_peer_device *peer_device, return -EIO; } -void INFO_bm_xfer_stats(struct drbd_device *device, +void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device, const char *direction, struct bm_xfer_ctx *c) { /* what would it take to transfer it "plaintext" */ - unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + unsigned int header_size = drbd_header_size(peer_device->connection); unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; unsigned int plain = header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + @@ -4794,7 +4797,7 @@ void INFO_bm_xfer_stats(struct drbd_device *device, r = 1000; r = 1000 - r; - drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " + drbd_info(peer_device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " "total %u; compression: %u.%u%%\n", direction, c->bytes[1], c->packets[1], @@ -4872,12 +4875,12 @@ static int receive_bitmap(struct drbd_connection *connection, struct packet_info goto out; } - INFO_bm_xfer_stats(device, "receive", &c); + INFO_bm_xfer_stats(peer_device, "receive", &c); if (device->state.conn == C_WF_BITMAP_T) { enum drbd_state_rv rv; - err = drbd_send_bitmap(device); + err = drbd_send_bitmap(device, peer_device); if (err) goto out; /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ @@ -4935,7 +4938,7 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet drbd_conn_str(device->state.conn)); } - drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + drbd_set_out_of_sync(peer_device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); return 0; } @@ -4956,7 +4959,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); - dec_rs_pending(device); + dec_rs_pending(peer_device); if (get_ldev(device)) { struct drbd_peer_request *peer_req; @@ -5214,7 +5217,7 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) if (get_ldev(device)) { drbd_bitmap_io(device, &drbd_bm_write_copy_pages, - "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); + "write from disconnected", BM_LOCKED_CHANGE_ALLOWED, NULL); put_ldev(device); } @@ -5648,22 +5651,23 @@ static int got_IsInSync(struct drbd_connection *connection, struct packet_info * if (get_ldev(device)) { drbd_rs_complete_io(device, sector); - drbd_set_in_sync(device, sector, blksize); + drbd_set_in_sync(peer_device, sector, blksize); /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); put_ldev(device); } - dec_rs_pending(device); + dec_rs_pending(peer_device); atomic_add(blksize >> 9, &device->rs_sect_in); return 0; } static int -validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, +validate_req_change_req_state(struct drbd_peer_device *peer_device, u64 id, sector_t sector, struct rb_root *root, const char *func, enum drbd_req_event what, bool missing_ok) { + struct drbd_device *device = peer_device->device; struct drbd_request *req; struct bio_and_error m; @@ -5673,7 +5677,7 @@ validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t secto spin_unlock_irq(&device->resource->req_lock); return -EIO; } - __req_mod(req, what, &m); + __req_mod(req, what, peer_device, &m); spin_unlock_irq(&device->resource->req_lock); if (m.bio) @@ -5698,8 +5702,8 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); if (p->block_id == ID_SYNCER) { - drbd_set_in_sync(device, sector, blksize); - dec_rs_pending(device); + drbd_set_in_sync(peer_device, sector, blksize); + dec_rs_pending(peer_device); return 0; } switch (pi->cmd) { @@ -5722,7 +5726,7 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info * BUG(); } - return validate_req_change_req_state(device, p->block_id, sector, + return validate_req_change_req_state(peer_device, p->block_id, sector, &device->write_requests, __func__, what, false); } @@ -5744,12 +5748,12 @@ static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); if (p->block_id == ID_SYNCER) { - dec_rs_pending(device); - drbd_rs_failed_io(device, sector, size); + dec_rs_pending(peer_device); + drbd_rs_failed_io(peer_device, sector, size); return 0; } - err = validate_req_change_req_state(device, p->block_id, sector, + err = validate_req_change_req_state(peer_device, p->block_id, sector, &device->write_requests, __func__, NEG_ACKED, true); if (err) { @@ -5758,7 +5762,7 @@ static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi request is no longer in the collision hash. */ /* In Protocol B we might already have got a P_RECV_ACK but then get a P_NEG_ACK afterwards. */ - drbd_set_out_of_sync(device, sector, size); + drbd_set_out_of_sync(peer_device, sector, size); } return 0; } @@ -5780,7 +5784,7 @@ static int got_NegDReply(struct drbd_connection *connection, struct packet_info drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", (unsigned long long)sector, be32_to_cpu(p->blksize)); - return validate_req_change_req_state(device, p->block_id, sector, + return validate_req_change_req_state(peer_device, p->block_id, sector, &device->read_requests, __func__, NEG_ACKED, false); } @@ -5803,13 +5807,13 @@ static int got_NegRSDReply(struct drbd_connection *connection, struct packet_inf update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); - dec_rs_pending(device); + dec_rs_pending(peer_device); if (get_ldev_if_state(device, D_FAILED)) { drbd_rs_complete_io(device, sector); switch (pi->cmd) { case P_NEG_RS_DREPLY: - drbd_rs_failed_io(device, sector, size); + drbd_rs_failed_io(peer_device, sector, size); break; case P_RS_CANCEL: break; @@ -5866,21 +5870,21 @@ static int got_OVResult(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) - drbd_ov_out_of_sync_found(device, sector, size); + drbd_ov_out_of_sync_found(peer_device, sector, size); else - ov_out_of_sync_print(device); + ov_out_of_sync_print(peer_device); if (!get_ldev(device)) return 0; drbd_rs_complete_io(device, sector); - dec_rs_pending(device); + dec_rs_pending(peer_device); --device->ov_left; /* let's advance progress step marks only for every other megabyte */ if ((device->ov_left & 0x200) == 0x200) - drbd_advance_rs_marks(device, device->ov_left); + drbd_advance_rs_marks(peer_device, device->ov_left); if (device->ov_left == 0) { dw = kmalloc(sizeof(*dw), GFP_NOIO); @@ -5890,8 +5894,8 @@ static int got_OVResult(struct drbd_connection *connection, struct packet_info * drbd_queue_work(&peer_device->connection->sender_work, &dw->w); } else { drbd_err(device, "kmalloc(dw) failed."); - ov_out_of_sync_print(device); - drbd_resync_finished(device); + ov_out_of_sync_print(peer_device); + drbd_resync_finished(peer_device); } } put_ldev(device); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index e36216d50753..380e6584a4ee 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -122,12 +122,13 @@ void drbd_req_destroy(struct kref *kref) * before it even was submitted or sent. * In that case we do not want to touch the bitmap at all. */ + struct drbd_peer_device *peer_device = first_peer_device(device); if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) - drbd_set_out_of_sync(device, req->i.sector, req->i.size); + drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size); if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) - drbd_set_in_sync(device, req->i.sector, req->i.size); + drbd_set_in_sync(peer_device, req->i.sector, req->i.size); } /* one might be tempted to move the drbd_al_complete_io @@ -552,12 +553,15 @@ static inline bool is_pending_write_protocol_A(struct drbd_request *req) * happen "atomically" within the req_lock, * and it enforces that we have to think in a very structured manner * about the "events" that may happen to a request during its life time ... + * + * + * peer_device == NULL means local disk */ int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct drbd_peer_device *peer_device, struct bio_and_error *m) { struct drbd_device *const device = req->device; - struct drbd_peer_device *const peer_device = first_peer_device(device); struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; struct net_conf *nc; int p, rv = 0; @@ -617,7 +621,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case READ_COMPLETED_WITH_ERROR: - drbd_set_out_of_sync(device, req->i.sector, req->i.size); + drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size); drbd_report_io_error(device, req); __drbd_chk_io_error(device, DRBD_READ_ERROR); fallthrough; @@ -1100,6 +1104,7 @@ static bool drbd_should_send_out_of_sync(union drbd_dev_state s) static int drbd_process_write_request(struct drbd_request *req) { struct drbd_device *device = req->device; + struct drbd_peer_device *peer_device = first_peer_device(device); int remote, send_oos; remote = drbd_should_do_remote(device->state); @@ -1115,7 +1120,7 @@ static int drbd_process_write_request(struct drbd_request *req) /* The only size==0 bios we expect are empty flushes. */ D_ASSERT(device, req->master_bio->bi_opf & REQ_PREFLUSH); if (remote) - _req_mod(req, QUEUE_AS_DRBD_BARRIER); + _req_mod(req, QUEUE_AS_DRBD_BARRIER, peer_device); return remote; } @@ -1125,10 +1130,10 @@ static int drbd_process_write_request(struct drbd_request *req) D_ASSERT(device, !(remote && send_oos)); if (remote) { - _req_mod(req, TO_BE_SENT); - _req_mod(req, QUEUE_FOR_NET_WRITE); - } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size)) - _req_mod(req, QUEUE_FOR_SEND_OOS); + _req_mod(req, TO_BE_SENT, peer_device); + _req_mod(req, QUEUE_FOR_NET_WRITE, peer_device); + } else if (drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size)) + _req_mod(req, QUEUE_FOR_SEND_OOS, peer_device); return remote; } @@ -1312,6 +1317,7 @@ static void drbd_update_plug(struct drbd_plug_cb *plug, struct drbd_request *req static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) { struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device = first_peer_device(device); const int rw = bio_data_dir(req->master_bio); struct bio_and_error m = { NULL, }; bool no_remote = false; @@ -1375,8 +1381,8 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request /* We either have a private_bio, or we can read from remote. * Otherwise we had done the goto nodata above. */ if (req->private_bio == NULL) { - _req_mod(req, TO_BE_SENT); - _req_mod(req, QUEUE_FOR_NET_READ); + _req_mod(req, TO_BE_SENT, peer_device); + _req_mod(req, QUEUE_FOR_NET_READ, peer_device); } else no_remote = true; } @@ -1397,7 +1403,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request req->pre_submit_jif = jiffies; list_add_tail(&req->req_pending_local, &device->pending_completion[rw == WRITE]); - _req_mod(req, TO_BE_SUBMITTED); + _req_mod(req, TO_BE_SUBMITTED, NULL); /* but we need to give up the spinlock to submit */ submit_private_bio = true; } else if (no_remote) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index b4017b5c3fbc..9ae860e7591b 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -267,6 +267,7 @@ struct bio_and_error { extern void start_new_tl_epoch(struct drbd_connection *connection); extern void drbd_req_destroy(struct kref *kref); extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct drbd_peer_device *peer_device, struct bio_and_error *m); extern void complete_master_bio(struct drbd_device *device, struct bio_and_error *m); @@ -280,14 +281,15 @@ extern void drbd_restart_request(struct drbd_request *req); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ -static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what, + struct drbd_peer_device *peer_device) { struct drbd_device *device = req->device; struct bio_and_error m; int rv; /* __req_mod possibly frees req, do not touch req after that! */ - rv = __req_mod(req, what, &m); + rv = __req_mod(req, what, peer_device, &m); if (m.bio) complete_master_bio(device, &m); @@ -299,7 +301,8 @@ static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) * of the lower level driver completion callback, so we need to * spin_lock_irqsave here. */ static inline int req_mod(struct drbd_request *req, - enum drbd_req_event what) + enum drbd_req_event what, + struct drbd_peer_device *peer_device) { unsigned long flags; struct drbd_device *device = req->device; @@ -307,7 +310,7 @@ static inline int req_mod(struct drbd_request *req, int rv; spin_lock_irqsave(&device->resource->req_lock, flags); - rv = __req_mod(req, what, &m); + rv = __req_mod(req, what, peer_device, &m); spin_unlock_irqrestore(&device->resource->req_lock, flags); if (m.bio) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 75d13ea0024f..563e67f1ead9 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1222,9 +1222,11 @@ void drbd_resume_al(struct drbd_device *device) } /* helper for _drbd_set_state */ -static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) +static void set_ov_position(struct drbd_peer_device *peer_device, enum drbd_conns cs) { - if (first_peer_device(device)->connection->agreed_pro_version < 90) + struct drbd_device *device = peer_device->device; + + if (peer_device->connection->agreed_pro_version < 90) device->ov_start_sector = 0; device->rs_total = drbd_bm_bits(device); device->ov_position = 0; @@ -1387,7 +1389,7 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns, unsigned long now = jiffies; int i; - set_ov_position(device, ns.conn); + set_ov_position(peer_device, ns.conn); device->rs_start = now; device->rs_last_sect_ev = 0; device->ov_last_oos_size = 0; @@ -1398,7 +1400,7 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns, device->rs_mark_time[i] = now; } - drbd_rs_controller_reset(device); + drbd_rs_controller_reset(peer_device); if (ns.conn == C_VERIFY_S) { drbd_info(device, "Starting Online Verify from sector %llu\n", @@ -1518,8 +1520,9 @@ static void abw_start_sync(struct drbd_device *device, int rv) } int drbd_bitmap_io_from_worker(struct drbd_device *device, - int (*io_fn)(struct drbd_device *), - char *why, enum bm_flag flags) + int (*io_fn)(struct drbd_device *, struct drbd_peer_device *), + char *why, enum bm_flag flags, + struct drbd_peer_device *peer_device) { int rv; @@ -1529,7 +1532,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); - rv = io_fn(device); + rv = io_fn(device, peer_device); drbd_bm_unlock(device); drbd_resume_io(device); @@ -1809,7 +1812,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, device->state.conn == C_WF_BITMAP_S) drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)", - BM_LOCKED_TEST_ALLOWED); + BM_LOCKED_TEST_ALLOWED, peer_device); /* Lost contact to peer's copy of the data */ if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) { @@ -1839,7 +1842,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, * No harm done if the bitmap still changes, * redirtied pages will follow later. */ drbd_bitmap_io_from_worker(device, &drbd_bm_write, - "demote diskless peer", BM_LOCKED_SET_ALLOWED); + "demote diskless peer", BM_LOCKED_SET_ALLOWED, peer_device); put_ldev(device); } @@ -1851,7 +1854,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* No changes to the bitmap expected this time, so assert that, * even though no harm was done if it did change. */ drbd_bitmap_io_from_worker(device, &drbd_bm_write, - "demote", BM_LOCKED_TEST_ALLOWED); + "demote", BM_LOCKED_TEST_ALLOWED, peer_device); put_ldev(device); } @@ -1888,7 +1891,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* no other bitmap changes expected during this phase */ drbd_queue_bitmap_io(device, &drbd_bmio_set_n_write, &abw_start_sync, - "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); + "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED, + peer_device); /* first half of local IO error, failure to attach, * or administrative detach */ @@ -2011,7 +2015,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) && (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) { drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, - "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED, + peer_device); put_ldev(device); } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f46738040d6b..4352a50fbb3f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -28,8 +28,8 @@ #include "drbd_protocol.h" #include "drbd_req.h" -static int make_ov_request(struct drbd_device *, int); -static int make_resync_request(struct drbd_device *, int); +static int make_ov_request(struct drbd_peer_device *, int); +static int make_resync_request(struct drbd_peer_device *, int); /* endio handlers: * drbd_md_endio (defined here) @@ -124,7 +124,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l * In case of a write error, send the neg ack anyways. */ if (!__test_and_set_bit(__EE_SEND_WRITE_ACK, &peer_req->flags)) inc_unacked(device); - drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + drbd_set_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size); } spin_lock_irqsave(&device->resource->req_lock, flags); @@ -276,7 +276,7 @@ void drbd_request_endio(struct bio *bio) /* not req_mod(), we need irqsave here! */ spin_lock_irqsave(&device->resource->req_lock, flags); - __req_mod(req, what, &m); + __req_mod(req, what, NULL, &m); spin_unlock_irqrestore(&device->resource->req_lock, flags); put_ldev(device); @@ -363,7 +363,7 @@ static int w_e_send_csum(struct drbd_work *w, int cancel) * drbd_alloc_pages due to pp_in_use > max_buffers. */ drbd_free_peer_req(device, peer_req); peer_req = NULL; - inc_rs_pending(device); + inc_rs_pending(peer_device); err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_CSUM_RS_REQUEST); @@ -430,10 +430,10 @@ int w_resync_timer(struct drbd_work *w, int cancel) switch (device->state.conn) { case C_VERIFY_S: - make_ov_request(device, cancel); + make_ov_request(first_peer_device(device), cancel); break; case C_SYNC_TARGET: - make_resync_request(device, cancel); + make_resync_request(first_peer_device(device), cancel); break; } @@ -493,8 +493,9 @@ struct fifo_buffer *fifo_alloc(unsigned int fifo_size) return fb; } -static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) +static int drbd_rs_controller(struct drbd_peer_device *peer_device, unsigned int sect_in) { + struct drbd_device *device = peer_device->device; struct disk_conf *dc; unsigned int want; /* The number of sectors we want in-flight */ int req_sect; /* Number of sectors to request in this turn */ @@ -545,8 +546,9 @@ static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) return req_sect; } -static int drbd_rs_number_requests(struct drbd_device *device) +static int drbd_rs_number_requests(struct drbd_peer_device *peer_device) { + struct drbd_device *device = peer_device->device; unsigned int sect_in; /* Number of sectors that came in since the last turn */ int number, mxb; @@ -556,7 +558,7 @@ static int drbd_rs_number_requests(struct drbd_device *device) rcu_read_lock(); mxb = drbd_get_max_buffers(device) / 2; if (rcu_dereference(device->rs_plan_s)->size) { - number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); + number = drbd_rs_controller(peer_device, sect_in) >> (BM_BLOCK_SHIFT - 9); device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; } else { device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; @@ -580,9 +582,9 @@ static int drbd_rs_number_requests(struct drbd_device *device) return number; } -static int make_resync_request(struct drbd_device *const device, int cancel) +static int make_resync_request(struct drbd_peer_device *const peer_device, int cancel) { - struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_device *const device = peer_device->device; struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; unsigned long bit; sector_t sector; @@ -598,7 +600,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel) if (device->rs_total == 0) { /* empty resync? */ - drbd_resync_finished(device); + drbd_resync_finished(peer_device); return 0; } @@ -618,7 +620,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel) } max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; - number = drbd_rs_number_requests(device); + number = drbd_rs_number_requests(peer_device); if (number <= 0) goto requeue; @@ -653,7 +655,7 @@ next_sector: sector = BM_BIT_TO_SECT(bit); - if (drbd_try_rs_begin_io(device, sector)) { + if (drbd_try_rs_begin_io(peer_device, sector)) { device->bm_resync_fo = bit; goto requeue; } @@ -729,13 +731,13 @@ next_sector: } else { int err; - inc_rs_pending(device); + inc_rs_pending(peer_device); err = drbd_send_drequest(peer_device, size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST, sector, size, ID_SYNCER); if (err) { drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); - dec_rs_pending(device); + dec_rs_pending(peer_device); put_ldev(device); return err; } @@ -760,8 +762,9 @@ next_sector: return 0; } -static int make_ov_request(struct drbd_device *device, int cancel) +static int make_ov_request(struct drbd_peer_device *peer_device, int cancel) { + struct drbd_device *device = peer_device->device; int number, i, size; sector_t sector; const sector_t capacity = get_capacity(device->vdisk); @@ -770,7 +773,7 @@ static int make_ov_request(struct drbd_device *device, int cancel) if (unlikely(cancel)) return 1; - number = drbd_rs_number_requests(device); + number = drbd_rs_number_requests(peer_device); sector = device->ov_position; for (i = 0; i < number; i++) { @@ -788,7 +791,7 @@ static int make_ov_request(struct drbd_device *device, int cancel) size = BM_BLOCK_SIZE; - if (drbd_try_rs_begin_io(device, sector)) { + if (drbd_try_rs_begin_io(peer_device, sector)) { device->ov_position = sector; goto requeue; } @@ -796,9 +799,9 @@ static int make_ov_request(struct drbd_device *device, int cancel) if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; - inc_rs_pending(device); + inc_rs_pending(peer_device); if (drbd_send_ov_request(first_peer_device(device), sector, size)) { - dec_rs_pending(device); + dec_rs_pending(peer_device); return 0; } sector += BM_SECT_PER_BIT; @@ -818,8 +821,8 @@ int w_ov_finished(struct drbd_work *w, int cancel) container_of(w, struct drbd_device_work, w); struct drbd_device *device = dw->device; kfree(dw); - ov_out_of_sync_print(device); - drbd_resync_finished(device); + ov_out_of_sync_print(first_peer_device(device)); + drbd_resync_finished(first_peer_device(device)); return 0; } @@ -831,7 +834,7 @@ static int w_resync_finished(struct drbd_work *w, int cancel) struct drbd_device *device = dw->device; kfree(dw); - drbd_resync_finished(device); + drbd_resync_finished(first_peer_device(device)); return 0; } @@ -846,9 +849,10 @@ static void ping_peer(struct drbd_device *device) test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED); } -int drbd_resync_finished(struct drbd_device *device) +int drbd_resync_finished(struct drbd_peer_device *peer_device) { - struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_device *device = peer_device->device; + struct drbd_connection *connection = peer_device->connection; unsigned long db, dt, dbdt; unsigned long n_oos; union drbd_state os, ns; @@ -1129,7 +1133,7 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req); } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { if (likely(device->state.pdsk >= D_INCONSISTENT)) { - inc_rs_pending(device); + inc_rs_pending(peer_device); if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req)) err = drbd_send_rs_deallocated(peer_device, peer_req); else @@ -1148,7 +1152,7 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); /* update resync data with failure */ - drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size); + drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size); } dec_unacked(device); @@ -1199,12 +1203,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) } if (eq) { - drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size); + drbd_set_in_sync(peer_device, peer_req->i.sector, peer_req->i.size); /* rs_same_csums unit is BM_BLOCK_SIZE */ device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req); } else { - inc_rs_pending(device); + inc_rs_pending(peer_device); peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ kfree(di); @@ -1257,10 +1261,10 @@ int w_e_end_ov_req(struct drbd_work *w, int cancel) * drbd_alloc_pages due to pp_in_use > max_buffers. */ drbd_free_peer_req(device, peer_req); peer_req = NULL; - inc_rs_pending(device); + inc_rs_pending(peer_device); err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY); if (err) - dec_rs_pending(device); + dec_rs_pending(peer_device); kfree(digest); out: @@ -1270,15 +1274,16 @@ out: return err; } -void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size) +void drbd_ov_out_of_sync_found(struct drbd_peer_device *peer_device, sector_t sector, int size) { + struct drbd_device *device = peer_device->device; if (device->ov_last_oos_start + device->ov_last_oos_size == sector) { device->ov_last_oos_size += size>>9; } else { device->ov_last_oos_start = sector; device->ov_last_oos_size = size>>9; } - drbd_set_out_of_sync(device, sector, size); + drbd_set_out_of_sync(peer_device, sector, size); } int w_e_end_ov_reply(struct drbd_work *w, int cancel) @@ -1328,9 +1333,9 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) * drbd_alloc_pages due to pp_in_use > max_buffers. */ drbd_free_peer_req(device, peer_req); if (!eq) - drbd_ov_out_of_sync_found(device, sector, size); + drbd_ov_out_of_sync_found(peer_device, sector, size); else - ov_out_of_sync_print(device); + ov_out_of_sync_print(peer_device); err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); @@ -1341,14 +1346,14 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) /* let's advance progress step marks only for every other megabyte */ if ((device->ov_left & 0x200) == 0x200) - drbd_advance_rs_marks(device, device->ov_left); + drbd_advance_rs_marks(peer_device, device->ov_left); stop_sector_reached = verify_can_do_stop_sector(device) && (sector + (size>>9)) >= device->ov_stop_sector; if (device->ov_left == 0 || stop_sector_reached) { - ov_out_of_sync_print(device); - drbd_resync_finished(device); + ov_out_of_sync_print(peer_device); + drbd_resync_finished(peer_device); } return err; @@ -1425,7 +1430,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); + req_mod(req, SEND_CANCELED, peer_device); return 0; } req->pre_send_jif = jiffies; @@ -1437,7 +1442,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) maybe_send_barrier(connection, req->epoch); err = drbd_send_out_of_sync(peer_device, req); - req_mod(req, OOS_HANDED_TO_NETWORK); + req_mod(req, OOS_HANDED_TO_NETWORK, peer_device); return err; } @@ -1457,7 +1462,7 @@ int w_send_dblock(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); + req_mod(req, SEND_CANCELED, peer_device); return 0; } req->pre_send_jif = jiffies; @@ -1467,7 +1472,7 @@ int w_send_dblock(struct drbd_work *w, int cancel) connection->send.current_epoch_writes++; err = drbd_send_dblock(peer_device, req); - req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK, peer_device); if (do_send_unplug && !err) pd_send_unplug_remote(peer_device); @@ -1490,7 +1495,7 @@ int w_send_read_req(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); + req_mod(req, SEND_CANCELED, peer_device); return 0; } req->pre_send_jif = jiffies; @@ -1502,7 +1507,7 @@ int w_send_read_req(struct drbd_work *w, int cancel) err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, (unsigned long)req); - req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK, peer_device); if (do_send_unplug && !err) pd_send_unplug_remote(peer_device); @@ -1668,8 +1673,9 @@ void drbd_resync_after_changed(struct drbd_device *device) } while (changed); } -void drbd_rs_controller_reset(struct drbd_device *device) +void drbd_rs_controller_reset(struct drbd_peer_device *peer_device) { + struct drbd_device *device = peer_device->device; struct gendisk *disk = device->ldev->backing_bdev->bd_disk; struct fifo_buffer *plan; @@ -1891,10 +1897,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) rcu_read_unlock(); schedule_timeout_interruptible(timeo); } - drbd_resync_finished(device); + drbd_resync_finished(peer_device); } - drbd_rs_controller_reset(device); + drbd_rs_controller_reset(peer_device); /* ns.conn may already be != device->state.conn, * we may have been paused in between, or become paused until * the timer triggers. @@ -1909,8 +1915,9 @@ out: mutex_unlock(device->state_mutex); } -static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) +static void update_on_disk_bitmap(struct drbd_peer_device *peer_device, bool resync_done) { + struct drbd_device *device = peer_device->device; struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; device->rs_last_bcast = jiffies; @@ -1919,7 +1926,7 @@ static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) drbd_bm_write_lazy(device, 0); if (resync_done && is_sync_state(device->state.conn)) - drbd_resync_finished(device); + drbd_resync_finished(peer_device); drbd_bcast_event(device, &sib); /* update timestamp, in case it took a while to write out stuff */ @@ -1945,6 +1952,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) static void go_diskless(struct drbd_device *device) { + struct drbd_peer_device *peer_device = first_peer_device(device); D_ASSERT(device, device->state.disk == D_FAILED); /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will * inc/dec it frequently. Once we are D_DISKLESS, no one will touch @@ -1970,7 +1978,7 @@ static void go_diskless(struct drbd_device *device) * Any modifications would not be expected anymore, though. */ if (drbd_bitmap_io_from_worker(device, drbd_bm_write, - "detach", BM_LOCKED_TEST_ALLOWED)) { + "detach", BM_LOCKED_TEST_ALLOWED, peer_device)) { if (test_bit(WAS_READ_ERROR, &device->flags)) { drbd_md_set_flag(device, MDF_FULL_SYNC); drbd_md_sync(device); @@ -2017,7 +2025,7 @@ static void do_device_work(struct drbd_device *device, const unsigned long todo) do_md_sync(device); if (test_bit(RS_DONE, &todo) || test_bit(RS_PROGRESS, &todo)) - update_on_disk_bitmap(device, test_bit(RS_DONE, &todo)); + update_on_disk_bitmap(first_peer_device(device), test_bit(RS_DONE, &todo)); if (test_bit(GO_DISKLESS, &todo)) go_diskless(device); if (test_bit(DESTROY_DISK, &todo)) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 592cfa8b765a..c0b1611b9665 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1934,11 +1934,11 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } } - if (!info->attrs[NBD_ATTR_SOCKETS]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) { pr_err("must specify at least one socket\n"); return -EINVAL; } - if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) { pr_err("must specify a size in bytes for the device\n"); return -EINVAL; } @@ -2123,7 +2123,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (!info->attrs[NBD_ATTR_INDEX]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) { pr_err("must specify an index to disconnect\n"); return -EINVAL; } @@ -2161,7 +2161,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (!info->attrs[NBD_ATTR_INDEX]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) { pr_err("must specify a device to reconfigure\n"); return -EINVAL; } @@ -2325,6 +2325,7 @@ static struct genl_family nbd_genl_family __ro_after_init = { .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops), .resv_start_op = NBD_CMD_STATUS + 1, .maxattr = NBD_ATTR_MAX, + .netnsok = 1, .policy = nbd_attr_policy, .mcgrps = nbd_mcast_grps, .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 9e6b032c8ecc..bc2c58724df3 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1030,8 +1030,8 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) if (!t_page) return -ENOMEM; - src = kmap_atomic(c_page->page); - dst = kmap_atomic(t_page->page); + src = kmap_local_page(c_page->page); + dst = kmap_local_page(t_page->page); for (i = 0; i < PAGE_SECTORS; i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { @@ -1043,8 +1043,8 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) } } - kunmap_atomic(dst); - kunmap_atomic(src); + kunmap_local(dst); + kunmap_local(src); ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); null_free_page(ret); @@ -1112,7 +1112,6 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; - void *dst, *src; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); @@ -1126,11 +1125,7 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, if (!t_page) return -ENOSPC; - src = kmap_atomic(source); - dst = kmap_atomic(t_page->page); - memcpy(dst + offset, src + off + count, temp); - kunmap_atomic(dst); - kunmap_atomic(src); + memcpy_page(t_page->page, offset, source, off + count, temp); __set_bit(sector & SECTOR_MASK, t_page->bitmap); @@ -1149,7 +1144,6 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; - void *dst, *src; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); @@ -1158,16 +1152,11 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, t_page = null_lookup_page(nullb, sector, false, !null_cache_active(nullb)); - dst = kmap_atomic(dest); - if (!t_page) { - memset(dst + off + count, 0, temp); - goto next; - } - src = kmap_atomic(t_page->page); - memcpy(dst + off + count, src + offset, temp); - kunmap_atomic(src); -next: - kunmap_atomic(dst); + if (t_page) + memcpy_page(dest, off + count, t_page->page, offset, + temp); + else + zero_user(dest, off + count, temp); count += temp; sector += temp >> SECTOR_SHIFT; @@ -1178,11 +1167,7 @@ next: static void nullb_fill_pattern(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off) { - void *dst; - - dst = kmap_atomic(page); - memset(dst + off, 0xFF, len); - kunmap_atomic(dst); + memset_page(page, off, 0xff, len); } blk_status_t null_handle_discard(struct nullb_device *dev, diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 604c1a13c76e..2a5b1e648f15 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -298,9 +298,7 @@ static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { - if (ubq->flags & UBLK_F_NEED_GET_DATA) - return true; - return false; + return ubq->flags & UBLK_F_NEED_GET_DATA; } static struct ublk_device *ublk_get_device(struct ublk_device *ub) @@ -349,25 +347,19 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) static inline bool ublk_queue_can_use_recovery_reissue( struct ublk_queue *ubq) { - if ((ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) - return true; - return false; + return (ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); } static inline bool ublk_queue_can_use_recovery( struct ublk_queue *ubq) { - if (ubq->flags & UBLK_F_USER_RECOVERY) - return true; - return false; + return ubq->flags & UBLK_F_USER_RECOVERY; } static inline bool ublk_can_use_recovery(struct ublk_device *ub) { - if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) - return true; - return false; + return ub->dev_info.flags & UBLK_F_USER_RECOVERY; } static void ublk_free_disk(struct gendisk *disk) @@ -428,10 +420,9 @@ static const struct block_device_operations ub_fops = { #define UBLK_MAX_PIN_PAGES 32 struct ublk_map_data { - const struct ublk_queue *ubq; const struct request *rq; - const struct ublk_io *io; - unsigned max_bytes; + unsigned long ubuf; + unsigned int len; }; struct ublk_io_iter { @@ -488,18 +479,17 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, return done; } -static inline int ublk_copy_user_pages(struct ublk_map_data *data, - bool to_vm) +static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) { const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->io->addr; + const unsigned long start_vm = data->ubuf; unsigned int done = 0; struct ublk_io_iter iter = { .pg_off = start_vm & (PAGE_SIZE - 1), .bio = data->rq->bio, .iter = data->rq->bio->bi_iter, }; - const unsigned int nr_pages = round_up(data->max_bytes + + const unsigned int nr_pages = round_up(data->len + (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; while (done < nr_pages) { @@ -512,42 +502,49 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data, iter.pages); if (iter.nr_pages <= 0) return done == 0 ? iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); + len = ublk_copy_io_pages(&iter, data->len, to_vm); for (i = 0; i < iter.nr_pages; i++) { if (to_vm) set_page_dirty(iter.pages[i]); put_page(iter.pages[i]); } - data->max_bytes -= len; + data->len -= len; done += iter.nr_pages; } return done; } +static inline bool ublk_need_map_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; +} + +static inline bool ublk_need_unmap_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; +} + static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages */ - if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) - return rq_bytes; - - if (ublk_rq_has_data(req)) { + if (ublk_need_map_req(req)) { struct ublk_map_data data = { - .ubq = ubq, .rq = req, - .io = io, - .max_bytes = rq_bytes, + .ubuf = io->addr, + .len = rq_bytes, }; ublk_copy_user_pages(&data, true); - return rq_bytes - data.max_bytes; + return rq_bytes - data.len; } return rq_bytes; } @@ -558,19 +555,18 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { + if (ublk_need_unmap_req(req)) { struct ublk_map_data data = { - .ubq = ubq, .rq = req, - .io = io, - .max_bytes = io->res, + .ubuf = io->addr, + .len = io->res, }; WARN_ON_ONCE(io->res > rq_bytes); ublk_copy_user_pages(&data, false); - return io->res - data.max_bytes; + return io->res - data.len; } return rq_bytes; } @@ -655,14 +651,15 @@ static void ublk_complete_rq(struct request *req) struct ublk_queue *ubq = req->mq_hctx->driver_data; struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; + blk_status_t res = BLK_STS_OK; /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; if (io->res < 0) { - blk_mq_end_request(req, errno_to_blk_status(io->res)); - return; + res = errno_to_blk_status(io->res); + goto exit; } /* @@ -671,10 +668,8 @@ static void ublk_complete_rq(struct request *req) * * Both the two needn't unmap. */ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { - blk_mq_end_request(req, BLK_STS_OK); - return; - } + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + goto exit; /* for READ request, writing data in iod->addr to rq buffers */ unmapped_bytes = ublk_unmap_io(ubq, req, io); @@ -691,6 +686,10 @@ static void ublk_complete_rq(struct request *req) blk_mq_requeue_request(req, true); else __blk_mq_end_request(req, BLK_STS_OK); + + return; +exit: + blk_mq_end_request(req, res); } /* @@ -771,9 +770,7 @@ static inline void __ublk_rq_task_work(struct request *req, return; } - if (ublk_need_get_data(ubq) && - (req_op(req) == REQ_OP_WRITE || - req_op(req) == REQ_OP_FLUSH)) { + if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv diff --git a/drivers/char/random.c b/drivers/char/random.c index 253f2ddb8913..2b47b500126c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1546,7 +1546,7 @@ const struct file_operations random_fops = { .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, .llseek = noop_llseek, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, }; @@ -1557,7 +1557,7 @@ const struct file_operations urandom_fops = { .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, .llseek = noop_llseek, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index b1d6ca7e9708..f3d6ce45c397 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -267,6 +267,8 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) if (!HFI1_CAP_IS_KSET(SDMA)) return -EINVAL; + if (!from->user_backed) + return -EINVAL; idx = srcu_read_lock(&fd->pq_srcu); pq = srcu_dereference(fd->pq, &fd->pq_srcu); if (!cq || !pq) { @@ -274,11 +276,6 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) return -EIO; } - if (!iter_is_iovec(from) || !dim) { - srcu_read_unlock(&fd->pq_srcu, idx); - return -EINVAL; - } - trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim); if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { @@ -287,11 +284,12 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) } while (dim) { + const struct iovec *iov = iter_iov(from); int ret; unsigned long count = 0; ret = hfi1_user_sdma_process_request( - fd, (struct iovec *)(from->iov + done), + fd, (struct iovec *)(iov + done), dim, &count); if (ret) { reqs = ret; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 076098803f52..53b2911281bc 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -2244,10 +2244,10 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from) struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); struct qib_user_sdma_queue *pq = fp->pq; - if (!iter_is_iovec(from) || !from->nr_segs || !pq) + if (!from->user_backed || !from->nr_segs || !pq) return -EINVAL; - return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs); + return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs); } static struct class *qib_class; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2055a758541d..7899f5fb4c13 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1202,21 +1202,12 @@ struct dm_crypto_profile { struct mapped_device *md; }; -struct dm_keyslot_evict_args { - const struct blk_crypto_key *key; - int err; -}; - static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct dm_keyslot_evict_args *args = data; - int err; + const struct blk_crypto_key *key = data; - err = blk_crypto_evict_key(dev->bdev, args->key); - if (!args->err) - args->err = err; - /* Always try to evict the key from all devices. */ + blk_crypto_evict_key(dev->bdev, key); return 0; } @@ -1229,7 +1220,6 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, { struct mapped_device *md = container_of(profile, struct dm_crypto_profile, profile)->md; - struct dm_keyslot_evict_args args = { key }; struct dm_table *t; int srcu_idx; @@ -1242,11 +1232,12 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, if (!ti->type->iterate_devices) continue; - ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); + ti->type->iterate_devices(ti, dm_keyslot_evict_callback, + (void *)key); } dm_put_live_table(md, srcu_idx); - return args.err; + return 0; } static int diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 4c7f74904c25..d4d0a41a905a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1486,7 +1486,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - size_t fragsz = it->iov[i].iov_len; + const struct iovec *iov = iter_iov(it); + size_t fragsz = iov->iov_len; struct page *page; void *frag; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 36fb945fdad4..9d117e579dfb 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -466,7 +466,7 @@ static const struct file_operations tty_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = tty_write, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, @@ -481,7 +481,7 @@ static const struct file_operations console_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = redirected_tty_write, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 32d0be968103..e68f7d226bc9 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -665,7 +665,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) { int sgl_count = 0; - if (!iter || !iter->iov) { + if (!iter || !iter_iov(iter)) { pr_err("%s: iter->iov is NULL, but expected bytes: %zu" " present\n", __func__, bytes); return -EINVAL; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b9433b96f4b..f53b7b75092d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3721,10 +3721,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, if (!iter_is_iovec(iter)) return 0; - for (seg = 0; seg < iter->nr_segs; seg++) - for (i = seg + 1; i < iter->nr_segs; i++) - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + const struct iovec *iov1 = iter_iov(iter) + seg; + const struct iovec *iov2 = iter_iov(iter) + i; + + if (iov1->iov_base == iov2->iov_base) return -EINVAL; + } + } return 0; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ac9034fce409..5fd67c13d44b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1362,7 +1362,7 @@ const struct file_operations cifs_file_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = cifs_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1382,7 +1382,7 @@ const struct file_operations cifs_file_strict_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = cifs_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1420,7 +1420,7 @@ const struct file_operations cifs_file_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = cifs_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1438,7 +1438,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = cifs_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 415176b2cf32..9321c161a27e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,9 +100,6 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); -extern ssize_t cifs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6831a9949c43..040bb30fb210 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -5066,19 +5066,3 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .launder_folio = cifs_launder_folio, .migrate_folio = filemap_migrate_folio, }; - -/* - * Splice data from a file into a pipe. - */ -ssize_t cifs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes)) - return 0; - if (unlikely(!len)) - return 0; - if (in->f_flags & O_DIRECT) - return direct_splice_read(in, ppos, pipe, len, flags); - return filemap_splice_read(in, ppos, pipe, len, flags); -} diff --git a/fs/coda/file.c b/fs/coda/file.c index 3f3c81e6b1ab..12b26bd13564 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/uio.h> +#include <linux/splice.h> #include <linux/coda.h> #include "coda_psdev.h" @@ -94,6 +95,32 @@ finish_write: return ret; } +static ssize_t +coda_file_splice_read(struct file *coda_file, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = coda_ftoc(coda_file); + struct file *in = cfi->cfi_container; + loff_t ki_pos = *ppos; + ssize_t ret; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + len, ki_pos, CODA_ACCESS_TYPE_READ); + if (ret) + goto finish_read; + + ret = vfs_splice_read(in, ppos, pipe, len, flags); + +finish_read: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + len, ki_pos, CODA_ACCESS_TYPE_READ_FINISH); + return ret; +} + static void coda_vm_open(struct vm_area_struct *vma) { @@ -302,5 +329,5 @@ const struct file_operations coda_file_operations = { .open = coda_open, .release = coda_release, .fsync = coda_fsync, - .splice_read = generic_file_splice_read, + .splice_read = coda_file_splice_read, }; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0b380bb8a81e..ad20f3428bab 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -402,6 +402,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; + /* for now require references for all pages */ + bio_set_flag(bio, BIO_PAGE_REFFED); sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0b8b4499e5ca..d101b3b0c7da 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | + FMODE_DIO_PARALLEL_WRITE; return dquot_file_open(inode, filp); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index de37a3a06a71..89d97f6188e0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1419,7 +1419,7 @@ out: static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { - return (unsigned long)ii->iov->iov_base + ii->iov_offset; + return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f771001574d0..ceeb0a183cea 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -202,7 +202,6 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); __bio_add_page(bio, page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e4a50e4ff0d2..9d23b8141db7 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -1011,7 +1011,7 @@ const struct file_operations kernfs_file_fops = { .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 7c04f033aadd..86197882ff35 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -419,6 +419,27 @@ out_unlock: return ret; } +static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + const struct cred *old_cred; + struct fd real; + ssize_t ret; + + ret = ovl_real_fdget(in, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(in)->i_sb); + ret = vfs_splice_read(real.file, ppos, pipe, len, flags); + revert_creds(old_cred); + ovl_file_accessed(in); + + fdput(real); + return ret; +} + /* * Calling iter_file_splice_write() directly from overlay's f_op may deadlock * due to lock order inversion between pipe->mutex in iter_file_splice_write() @@ -695,7 +716,7 @@ const struct file_operations ovl_file_operations = { .fallocate = ovl_fallocate, .fadvise = ovl_fadvise, .flush = ovl_flush, - .splice_read = generic_file_splice_read, + .splice_read = ovl_splice_read, .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f495fdb39151..711f12706469 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -591,7 +591,7 @@ static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, @@ -617,7 +617,7 @@ static const struct file_operations proc_reg_file_ops_compat = { static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 875771bf1f93..ff88a5da6840 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -868,7 +868,7 @@ static const struct file_operations proc_sys_file_operations = { .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 846f9455ae22..492abbbeff5e 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -324,7 +324,7 @@ static int mountstats_open(struct inode *inode, struct file *file) const struct file_operations proc_mounts_operations = { .open = mounts_open, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, @@ -333,7 +333,7 @@ const struct file_operations proc_mounts_operations = { const struct file_operations proc_mountinfo_operations = { .open = mountinfo_open, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, @@ -342,7 +342,7 @@ const struct file_operations proc_mountinfo_operations = { const struct file_operations proc_mountstats_operations = { .open = mountstats_open, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .llseek = seq_lseek, .release = mounts_release, }; diff --git a/fs/read_write.c b/fs/read_write.c index 7a2ff6157eda..a21ba3be7dbe 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return -EOPNOTSUPP; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; if (type == READ) { - nr = filp->f_op->read(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->read(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } else { - nr = filp->f_op->write(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->write(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } if (nr < 0) { @@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, break; } ret += nr; - if (nr != iovec.iov_len) + if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } diff --git a/fs/splice.c b/fs/splice.c index 7a423ef8f854..4247933f9eaa 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -299,7 +299,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; struct page **pages; ssize_t ret; - size_t used, npages, chunk, remain, reclaim; + size_t used, npages, chunk, remain, keep = 0; int i; /* Work out how much data we can actually add into the pipe */ @@ -313,7 +313,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, if (!bv) return -ENOMEM; - pages = (void *)(bv + npages); + pages = (struct page **)(bv + npages); npages = alloc_pages_bulk_array(GFP_USER, npages, pages); if (!npages) { kfree(bv); @@ -336,11 +336,8 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); - reclaim = npages * PAGE_SIZE; - remain = 0; if (ret > 0) { - reclaim -= ret; - remain = ret; + keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; file_accessed(in); } else if (ret < 0) { @@ -353,14 +350,12 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, } /* Free any pages that didn't get touched at all. */ - reclaim /= PAGE_SIZE; - if (reclaim) { - npages -= reclaim; - release_pages(pages + npages, reclaim); - } + if (keep < npages) + release_pages(pages + keep, npages - keep); /* Push the remaining pages into the pipe. */ - for (i = 0; i < npages; i++) { + remain = ret; + for (i = 0; i < keep; i++) { struct pipe_buffer *buf = pipe_head_buf(pipe); chunk = min_t(size_t, remain, PAGE_SIZE); @@ -396,29 +391,13 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct iov_iter to; - struct kiocb kiocb; - int ret; - - iov_iter_pipe(&to, ITER_DEST, pipe, len); - init_sync_kiocb(&kiocb, in); - kiocb.ki_pos = *ppos; - ret = call_read_iter(in, &kiocb, &to); - if (ret > 0) { - *ppos = kiocb.ki_pos; - file_accessed(in); - } else if (ret < 0) { - /* free what was emitted */ - pipe_discard_from(pipe, to.start_head); - /* - * callers of ->splice_read() expect -EAGAIN on - * "can't put anything in there", rather than -EFAULT. - */ - if (ret == -EFAULT) - ret = -EAGAIN; - } - - return ret; + if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes)) + return 0; + if (unlikely(!len)) + return 0; + if (in->f_flags & O_DIRECT) + return direct_splice_read(in, ppos, pipe, len, flags); + return filemap_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL(generic_file_splice_read); @@ -860,12 +839,24 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, return out->f_op->splice_write(pipe, out, ppos, len, flags); } -/* - * Attempt to initiate a splice from a file to a pipe. +/** + * vfs_splice_read - Read data from a file and splice it into a pipe + * @in: File to splice from + * @ppos: Input file offset + * @pipe: Pipe to splice to + * @len: Number of bytes to splice + * @flags: Splice modifier flags (SPLICE_F_*) + * + * Splice the requested amount of data from the input file to the pipe. This + * is synchronous as the caller must hold the pipe lock across the entire + * operation. + * + * If successful, it returns the amount of data spliced, 0 if it hit the EOF or + * a hole and a negative error code otherwise. */ -static long do_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +long vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { unsigned int p_space; int ret; @@ -888,6 +879,7 @@ static long do_splice_to(struct file *in, loff_t *ppos, return warn_unsupported(in, "read"); return in->f_op->splice_read(in, ppos, pipe, len, flags); } +EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes @@ -957,7 +949,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = do_splice_to(in, &pos, pipe, len, flags); + ret = vfs_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto out_release; @@ -1105,7 +1097,7 @@ long splice_file_to_pipe(struct file *in, pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) - ret = do_splice_to(in, offset, opipe, len, flags); + ret = vfs_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 528fc538b6b9..aede746541f8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1171,7 +1171,8 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_DIO_PARALLEL_WRITE; return generic_file_open(inode, file); } diff --git a/include/linux/bio.h b/include/linux/bio.h index b3e7529ff55e..8588bcfbc6ef 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -229,7 +229,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) static inline bool bio_flagged(struct bio *bio, unsigned int bit) { - return (bio->bi_flags & (1U << bit)) != 0; + return bio->bi_flags & (1U << bit); } static inline void bio_set_flag(struct bio *bio, unsigned int bit) @@ -488,7 +488,8 @@ void zero_fill_bio(struct bio *bio); static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + if (bio_flagged(bio, BIO_PAGE_REFFED) || + bio_flagged(bio, BIO_PAGE_PINNED)) __bio_release_pages(bio, mark_dirty); } diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 1e3e5d0adf12..5e5822c18ee4 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -95,8 +95,8 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct block_device *bdev, - const struct blk_crypto_key *key); +void blk_crypto_evict_key(struct block_device *bdev, + const struct blk_crypto_key *key); bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index de0b0c3e7395..06caacd77ed6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -57,8 +57,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) -/* already slept for hybrid poll */ -#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) /* queue has elevator attached */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fb8843990d28..80670a641cc2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -318,7 +318,8 @@ struct bio { * bio flags */ enum { - BIO_NO_PAGE_REF, /* don't put release vec pages */ + BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ + BIO_PAGE_REFFED, /* put pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ BIO_QUIET, /* Make BIO Quiet */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 941304f17492..e3242e67a8e3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -44,12 +44,6 @@ extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; -/* Must be consistent with blk_mq_poll_stats_bkt() */ -#define BLK_MQ_POLL_STATS_BKTS 16 - -/* Doing classic polling */ -#define BLK_MQ_POLL_CLASSIC -1 - /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. @@ -468,10 +462,6 @@ struct request_queue { #endif unsigned int rq_timeout; - int poll_nsec; - - struct blk_stat_callback *poll_cb; - struct blk_rq_stat *poll_stat; struct timer_list timeout; struct work_struct timeout_work; @@ -870,8 +860,6 @@ blk_status_t errno_to_blk_status(int errno); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) -/* do not sleep to wait for the expected completion time */ -#define BLK_POLL_NOSLEEP (1 << 1) int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index ef2281a2acce..67495ef79bb2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)0x800000) +/* File supports non-exclusive O_DIRECT writes from multiple threads */ +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 4a4b387181ad..2984b0cb24b1 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -209,7 +209,7 @@ static int s_name ## _from_attrs_for_change(struct s_name *s, \ * Magic: define op number to op name mapping {{{1 * {{{2 */ -const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) +static const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) { switch (cmd) { #undef GENL_op diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00689c12f6ab..fa621a508a01 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -188,8 +188,10 @@ struct io_ev_fd { }; struct io_alloc_cache { - struct hlist_head list; + struct io_wq_work_node list; unsigned int nr_cached; + unsigned int max_cached; + size_t elem_size; }; struct io_ring_ctx { @@ -239,7 +241,6 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; @@ -295,7 +296,7 @@ struct io_ring_ctx { spinlock_t completion_lock; bool poll_multi_queue; - bool cq_waiting; + atomic_t cq_wait_nr; /* * ->iopoll_list is protected by the ctx->uring_lock for @@ -330,11 +331,9 @@ struct io_ring_ctx { struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; - struct delayed_work rsrc_put_work; - struct callback_head rsrc_put_tw; - struct llist_head rsrc_put_llist; + /* protected by ->uring_lock */ struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; + struct io_alloc_cache rsrc_node_cache; struct list_head io_buffers_pages; @@ -366,6 +365,11 @@ struct io_ring_ctx { unsigned evfd_last_cq_tail; }; +struct io_tw_state { + /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ + bool locked; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -472,7 +476,7 @@ enum { REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); struct io_task_work { struct llist_node node; @@ -562,6 +566,7 @@ struct io_kiocb { atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; + unsigned nr_tw; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ union { struct hlist_node hash_node; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 31ac562a17d7..042c1e2cb0ce 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -45,6 +45,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_WRITE_SHADOW_MBR: case IOC_OPAL_GENERIC_TABLE_RW: case IOC_OPAL_GET_STATUS: + case IOC_OPAL_GET_LR_STATUS: return true; } return false; diff --git a/include/linux/splice.h b/include/linux/splice.h index a55179fd60fc..8f052c3dae95 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -76,6 +76,9 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *, struct splice_pipe_desc *); extern ssize_t add_to_pipe(struct pipe_inode_info *, struct pipe_buffer *); +long vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); extern long do_splice(struct file *in, loff_t *off_in, diff --git a/include/linux/uio.h b/include/linux/uio.h index 29eb18bb6feb..c459e1d5772b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,7 +11,6 @@ #include <uapi/linux/uio.h> struct page; -struct pipe_inode_info; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -25,7 +24,6 @@ enum iter_type { ITER_IOVEC, ITER_KVEC, ITER_BVEC, - ITER_PIPE, ITER_XARRAY, ITER_DISCARD, ITER_UBUF, @@ -45,29 +43,52 @@ struct iov_iter { bool nofault; bool data_source; bool user_backed; + size_t iov_offset; + /* + * Hack alert: overlay ubuf_iovec with iovec + count, so + * that the members resolve correctly regardless of the type + * of iterator used. This means that you can use: + * + * &iter->__ubuf_iovec or iter->__iov + * + * interchangably for the user_backed cases, hence simplifying + * some of the cases that need to deal with both. + */ union { - size_t iov_offset; - int last_offset; - }; - size_t count; - union { - const struct iovec *iov; - const struct kvec *kvec; - const struct bio_vec *bvec; - struct xarray *xarray; - struct pipe_inode_info *pipe; - void __user *ubuf; + /* + * This really should be a const, but we cannot do that without + * also modifying any of the zero-filling iter init functions. + * Leave it non-const for now, but it should be treated as such. + */ + struct iovec __ubuf_iovec; + struct { + union { + /* use iter_iov() to get the current vec */ + const struct iovec *__iov; + const struct kvec *kvec; + const struct bio_vec *bvec; + struct xarray *xarray; + void __user *ubuf; + }; + size_t count; + }; }; union { unsigned long nr_segs; - struct { - unsigned int head; - unsigned int start_head; - }; loff_t xarray_start; }; }; +static inline const struct iovec *iter_iov(const struct iov_iter *iter) +{ + if (iter->iter_type == ITER_UBUF) + return (const struct iovec *) &iter->__ubuf_iovec; + return iter->__iov; +} + +#define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) +#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) + static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; @@ -101,11 +122,6 @@ static inline bool iov_iter_is_bvec(const struct iov_iter *i) return iov_iter_type(i) == ITER_BVEC; } -static inline bool iov_iter_is_pipe(const struct iov_iter *i) -{ - return iov_iter_type(i) == ITER_PIPE; -} - static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; @@ -143,15 +159,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) return ret; } -static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) -{ - return (struct iovec) { - .iov_base = iter->iov->iov_base + iter->iov_offset, - .iov_len = min(iter->count, - iter->iov->iov_len - iter->iov_offset), - }; -} - size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); @@ -249,19 +256,11 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); -void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, - size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); -ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, - size_t maxsize, unsigned maxpages, size_t *start, - iov_iter_extraction_t extraction_flags); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, size_t *start, - iov_iter_extraction_t extraction_flags); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); @@ -361,7 +360,8 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, .user_backed = true, .data_source = direction, .ubuf = buf, - .count = count + .count = count, + .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 936fd41bf147..69454f1f98b0 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete, ); /** - * io_uring_submit_sqe - called before submitting one SQE + * io_uring_submit_req - called before submitting a request * * @req: pointer to a submitted request - * @force_nonblock: whether a context blocking or not * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ -TRACE_EVENT(io_uring_submit_sqe, +TRACE_EVENT(io_uring_submit_req, - TP_PROTO(struct io_kiocb *req, bool force_nonblock), + TP_PROTO(struct io_kiocb *req), - TP_ARGS(req, force_nonblock), + TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe, __field( unsigned long long, user_data ) __field( u8, opcode ) __field( u32, flags ) - __field( bool, force_nonblock ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) @@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->flags = req->flags; - __entry->force_nonblock = force_nonblock; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " - "non block %d, sq_thread %d", __entry->ctx, __entry->req, + "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), - __entry->flags, __entry->force_nonblock, __entry->sq_thread) + __entry->flags, __entry->sq_thread) ); /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 709de6d4feb2..f8d14d1c58d3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -389,6 +389,9 @@ enum { #define IORING_OFF_SQ_RING 0ULL #define IORING_OFF_CQ_RING 0x8000000ULL #define IORING_OFF_SQES 0x10000000ULL +#define IORING_OFF_PBUF_RING 0x80000000ULL +#define IORING_OFF_PBUF_SHIFT 16 +#define IORING_OFF_MMAP_MASK 0xf8000000ULL /* * Filled with the offset for mmap(2) @@ -568,19 +571,6 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; -struct io_uring_notification_slot { - __u64 tag; - __u64 resv[3]; -}; - -struct io_uring_notification_register { - __u32 nr_slots; - __u32 resv; - __u64 resv2; - __u64 data; - __u64 resv3; -}; - /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) @@ -635,12 +625,26 @@ struct io_uring_buf_ring { }; }; +/* + * Flags for IORING_REGISTER_PBUF_RING. + * + * IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring. + * The application must not set a ring_addr in struct + * io_uring_buf_reg, instead it must subsequently call + * mmap(2) with the offset set as: + * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) + * to get a virtual mapping for the ring. + */ +enum { + IOU_PBUF_RING_MMAP = 1, +}; + /* argument for IORING_(UN)REGISTER_PBUF_RING */ struct io_uring_buf_reg { __u64 ring_addr; __u32 ring_entries; __u16 bgid; - __u16 pad; + __u16 flags; __u64 resv[3]; }; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index d7a1524023db..3905c8ffedbf 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -78,6 +78,16 @@ struct opal_user_lr_setup { struct opal_session_info session; }; +struct opal_lr_status { + struct opal_session_info session; + __u64 range_start; + __u64 range_length; + __u32 RLE; /* Read Lock enabled */ + __u32 WLE; /* Write Lock Enabled */ + __u32 l_state; + __u8 align[4]; +}; + struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; @@ -168,5 +178,6 @@ struct opal_status { #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) #define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) +#define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #endif /* _UAPI_SED_OPAL_H */ diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index c2cde88aeed5..851a527afb5e 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -7,15 +7,17 @@ #define IO_ALLOC_CACHE_MAX 512 struct io_cache_entry { - struct hlist_node node; + struct io_wq_work_node node; }; static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + if (cache->nr_cached < cache->max_cached) { cache->nr_cached++; - hlist_add_head(&entry->node, &cache->list); + wq_stack_add_head(&entry->node, &cache->list); + /* KASAN poisons object */ + kasan_slab_free_mempool(entry); return true; } return false; @@ -23,31 +25,37 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + if (cache->list.next) { + struct io_cache_entry *entry; - hlist_del(node); + entry = container_of(cache->list.next, struct io_cache_entry, node); + kasan_unpoison_range(entry, cache->elem_size); + cache->list.next = cache->list.next->next; cache->nr_cached--; - return container_of(node, struct io_cache_entry, node); + return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, size_t size) { - INIT_HLIST_HEAD(&cache->list); + cache->list.next = NULL; cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(struct io_cache_entry *)) { - while (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + while (1) { + struct io_cache_entry *entry = io_alloc_cache_get(cache); - hlist_del(node); - free(container_of(node, struct io_cache_entry, node)); + if (!entry) + break; + free(entry); } cache->nr_cached = 0; } diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index f81c0a7136a5..b2715988791e 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -15,6 +15,7 @@ #include <linux/cpu.h> #include <linux/task_work.h> #include <linux/audit.h> +#include <linux/mmu_context.h> #include <uapi/linux/io_uring.h> #include "io-wq.h" @@ -39,7 +40,7 @@ enum { }; /* - * One for each thread in a wqe pool + * One for each thread in a wq pool */ struct io_worker { refcount_t ref; @@ -47,7 +48,7 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - struct io_wqe *wqe; + struct io_wq *wq; struct io_wq_work *cur_work; struct io_wq_work *next_work; @@ -73,7 +74,7 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) -struct io_wqe_acct { +struct io_wq_acct { unsigned nr_workers; unsigned max_workers; int index; @@ -90,26 +91,6 @@ enum { }; /* - * Per-node worker thread pool - */ -struct io_wqe { - raw_spinlock_t lock; - struct io_wqe_acct acct[IO_WQ_ACCT_NR]; - - int node; - - struct hlist_nulls_head free_list; - struct list_head all_list; - - struct wait_queue_entry wait; - - struct io_wq *wq; - struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; - - cpumask_var_t cpu_mask; -}; - -/* * Per io_wq state */ struct io_wq { @@ -127,7 +108,19 @@ struct io_wq { struct task_struct *task; - struct io_wqe *wqes[]; + struct io_wq_acct acct[IO_WQ_ACCT_NR]; + + /* lock protects access to elements below */ + raw_spinlock_t lock; + + struct hlist_nulls_head free_list; + struct list_head all_list; + + struct wait_queue_entry wait; + + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; + + cpumask_var_t cpu_mask; }; static enum cpuhp_state io_wq_online; @@ -140,10 +133,10 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); -static void io_wqe_dec_running(struct io_worker *worker); -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, +static bool create_io_worker(struct io_wq *wq, int index); +static void io_wq_dec_running(struct io_worker *worker); +static bool io_acct_cancel_pending_work(struct io_wq *wq, + struct io_wq_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); @@ -159,20 +152,20 @@ static void io_worker_release(struct io_worker *worker) complete(&worker->ref_done); } -static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) +static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) { - return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; + return &wq->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; } -static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, - struct io_wq_work *work) +static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, + struct io_wq_work *work) { - return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) +static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); } static void io_worker_ref_put(struct io_wq *wq) @@ -183,14 +176,13 @@ static void io_worker_ref_put(struct io_wq *wq) static void io_worker_cancel_cb(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -208,8 +200,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -223,23 +214,23 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wqe->lock); - io_wqe_dec_running(worker); + raw_spin_unlock(&wq->lock); + io_wq_dec_running(worker); worker->flags = 0; preempt_disable(); current->flags &= ~PF_IO_WORKER; preempt_enable(); kfree_rcu(worker, rcu); - io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); do_exit(0); } -static inline bool io_acct_run_queue(struct io_wqe_acct *acct) +static inline bool io_acct_run_queue(struct io_wq_acct *acct) { bool ret = false; @@ -256,8 +247,8 @@ static inline bool io_acct_run_queue(struct io_wqe_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe, - struct io_wqe_acct *acct) +static bool io_wq_activate_free_worker(struct io_wq *wq, + struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -268,10 +259,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wqe_get_acct(worker) != acct) { + if (io_wq_get_acct(worker) != acct) { io_worker_release(worker); continue; } @@ -289,7 +280,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one. */ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) { /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -298,21 +289,21 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); + atomic_inc(&wq->worker_refs); + return create_io_worker(wq, acct->index); } -static void io_wqe_inc_running(struct io_worker *worker) +static void io_wq_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_inc(&acct->nr_running); } @@ -321,22 +312,22 @@ static void create_worker_cb(struct callback_head *cb) { struct io_worker *worker; struct io_wq *wq; - struct io_wqe *wqe; - struct io_wqe_acct *acct; + + struct io_wq_acct *acct; bool do_create = false; worker = container_of(cb, struct io_worker, create_work); - wqe = worker->wqe; - wq = wqe->wq; - acct = &wqe->acct[worker->create_index]; - raw_spin_lock(&wqe->lock); + wq = worker->wq; + acct = &wq->acct[worker->create_index]; + raw_spin_lock(&wq->lock); + if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create) { - create_io_worker(wq, wqe, worker->create_index); + create_io_worker(wq, worker->create_index); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -346,11 +337,10 @@ static void create_worker_cb(struct callback_head *cb) } static bool io_queue_worker_create(struct io_worker *worker, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, task_work_func_t func) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) @@ -392,10 +382,10 @@ fail: return false; } -static void io_wqe_dec_running(struct io_worker *worker) +static void io_wq_dec_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; if (!(worker->flags & IO_WORKER_F_UP)) return; @@ -406,7 +396,7 @@ static void io_wqe_dec_running(struct io_worker *worker) return; atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); + atomic_inc(&wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); } @@ -414,29 +404,25 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } } /* - * No work, worker going to sleep. Move to freelist, and unuse mm if we - * have one attached. Dropping the mm may potentially sleep, so we drop - * the lock in that case and return success. Since the caller has to - * retry the loop in that case (we changed task state), we don't regrab - * the lock if we return success. + * No work, worker going to sleep. Move to freelist. */ -static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) +static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) + __must_hold(wq->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -445,17 +431,16 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } -static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { - struct io_wq *wq = wqe->wq; bool ret = false; spin_lock_irq(&wq->hash->wait.lock); - if (list_empty(&wqe->wait.entry)) { - __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (list_empty(&wq->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wq->wait); if (!test_bit(hash, &wq->hash->map)) { __set_current_state(TASK_RUNNING); - list_del_init(&wqe->wait.entry); + list_del_init(&wq->wait.entry); ret = true; } } @@ -463,14 +448,14 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) return ret; } -static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, +static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_worker *worker) __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { unsigned int hash; @@ -485,11 +470,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, hash = io_get_work_hash(work); /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + tail = wq->hash_tail[hash]; /* hashed, can run if not already running */ - if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { - wqe->hash_tail[hash] = NULL; + if (!test_and_set_bit(hash, &wq->hash->map)) { + wq->hash_tail[hash] = NULL; wq_list_cut(&acct->work_list, &tail->list, prev); return work; } @@ -508,12 +493,12 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - unstalled = io_wait_on_hash(wqe, stall_hash); + unstalled = io_wait_on_hash(wq, stall_hash); raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - if (wq_has_sleeper(&wqe->wq->hash->wait)) - wake_up(&wqe->wq->hash->wait); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); } } @@ -534,13 +519,10 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_unlock(&worker->lock); } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); - static void io_worker_handle_work(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { @@ -557,7 +539,7 @@ static void io_worker_handle_work(struct io_worker *worker) work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wqe, worker); + __io_worker_busy(wq, worker); /* * Make sure cancelation can find this, even before @@ -595,7 +577,7 @@ static void io_worker_handle_work(struct io_worker *worker) } io_assign_current_work(worker, work); if (linked) - io_wqe_enqueue(wqe, linked); + io_wq_enqueue(wq, linked); if (hash != -1U && !next_hashed) { /* serialize hash clear with wake_up() */ @@ -610,12 +592,11 @@ static void io_worker_handle_work(struct io_worker *worker) } while (1); } -static int io_wqe_worker(void *data) +static int io_wq_worker(void *data) { struct io_worker *worker = data; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; @@ -631,20 +612,20 @@ static int io_wqe_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(worker); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. */ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wqe, worker); - raw_spin_unlock(&wqe->lock); + __io_worker_idle(wq, worker); + raw_spin_unlock(&wq->lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -658,7 +639,7 @@ static int io_wqe_worker(void *data) if (!ret) { last_timeout = true; exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), - wqe->cpu_mask); + wq->cpu_mask); } } @@ -683,7 +664,7 @@ void io_wq_worker_running(struct task_struct *tsk) if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(worker); + io_wq_inc_running(worker); } /* @@ -702,21 +683,21 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - io_wqe_dec_running(worker); + io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); + set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); + raw_spin_lock(&wq->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + list_add_tail_rcu(&worker->all_list, &wq->all_list); worker->flags |= IO_WORKER_F_FREE; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -749,21 +730,21 @@ static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; struct task_struct *tsk; - struct io_wqe *wqe; + struct io_wq *wq; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); - wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + wq = worker->wq; + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -771,13 +752,13 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wqe->lock); - while (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_unlock(&wq->lock); + while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } - io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); kfree(worker); return; } @@ -790,42 +771,42 @@ static void create_worker_cont(struct callback_head *cb) static void io_workqueue_create(struct work_struct *work) { struct io_worker *worker = container_of(work, struct io_worker, work); - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static bool create_io_worker(struct io_wq *wq, int index) { - struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; struct task_struct *tsk; __set_current_state(TASK_RUNNING); - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); - worker->wqe = wqe; + worker->wq = wq; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { kfree(worker); goto fail; @@ -841,14 +822,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, +static bool io_wq_for_each_worker(struct io_wq *wq, bool (*func)(struct io_worker *, void *), void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + list_for_each_entry_rcu(worker, &wq->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -869,10 +850,8 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) +static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { - struct io_wq *wq = wqe->wq; - do { work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); @@ -880,9 +859,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) } while (work); } -static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; @@ -893,8 +872,8 @@ append: } hash = io_get_work_hash(work); - tail = wqe->hash_tail[hash]; - wqe->hash_tail[hash] = work; + tail = wq->hash_tail[hash]; + wq->hash_tail[hash] = work; if (!tail) goto append; @@ -906,9 +885,9 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) return work == data; } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -917,55 +896,48 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * If io-wq is exiting for this task, or if the request has explicitly * been marked as one that should not get executed, cancel it here. */ - if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || (work->flags & IO_WQ_WORK_CANCEL)) { - io_run_cancel(work, wqe); + io_run_cancel(work, wq); return; } raw_spin_lock(&acct->lock); - io_wqe_insert_work(wqe, work); + io_wq_insert_work(wq, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); rcu_read_lock(); - do_create = !io_wqe_activate_free_worker(wqe, acct); + do_create = !io_wq_activate_free_worker(wq, acct); rcu_read_unlock(); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running))) { bool did_create; - did_create = io_wqe_create_worker(wqe, acct); + did_create = io_wq_create_worker(wq, acct); if (likely(did_create)) return; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ match.fn = io_wq_work_match_item, match.data = work, match.cancel_all = false, - io_acct_cancel_pending_work(wqe, acct, &match); + io_acct_cancel_pending_work(wq, acct, &match); } } -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) -{ - struct io_wqe *wqe = wq->wqes[numa_node_id()]; - - io_wqe_enqueue(wqe, work); -} - /* * Work items that hash to the same value will not be done in parallel. * Used to limit concurrent writes, generally hashed by inode. @@ -1008,27 +980,27 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } -static inline void io_wqe_remove_pending(struct io_wqe *wqe, +static inline void io_wq_remove_pending(struct io_wq *wq, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; - if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); if (prev_work && io_get_work_hash(prev_work) == hash) - wqe->hash_tail[hash] = prev_work; + wq->hash_tail[hash] = prev_work; else - wqe->hash_tail[hash] = NULL; + wq->hash_tail[hash] = NULL; } wq_list_del(&acct->work_list, &work->list, prev); } -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, +static bool io_acct_cancel_pending_work(struct io_wq *wq, + struct io_wq_acct *acct, struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; @@ -1039,9 +1011,9 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wqe_remove_pending(wqe, work, prev); + io_wq_remove_pending(wq, work, prev); raw_spin_unlock(&acct->lock); - io_run_cancel(work, wqe); + io_run_cancel(work, wq); match->nr_pending++; /* not safe to continue after unlock */ return true; @@ -1051,15 +1023,15 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, return false; } -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) +static void io_wq_cancel_pending_work(struct io_wq *wq, + struct io_cb_cancel_data *match) { int i; retry: for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); + struct io_wq_acct *acct = io_get_acct(wq, i == 0); - if (io_acct_cancel_pending_work(wqe, acct, match)) { + if (io_acct_cancel_pending_work(wq, acct, match)) { if (match->cancel_all) goto retry; break; @@ -1067,11 +1039,11 @@ retry: } } -static void io_wqe_cancel_running_work(struct io_wqe *wqe, +static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wq, io_wq_worker_cancel, match); rcu_read_unlock(); } @@ -1083,7 +1055,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .data = data, .cancel_all = cancel_all, }; - int node; /* * First check pending list, if we're lucky we can just remove it @@ -1095,22 +1066,18 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wqe->lock, to ensure that + * Do both of these while holding the wq->lock, to ensure that * we'll find a work item regardless of state. */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_running && !match.cancel_all) - return IO_WQ_CANCEL_RUNNING; - } + io_wq_cancel_pending_work(wq, &match); + if (match.nr_pending && !match.cancel_all) + return IO_WQ_CANCEL_OK; + + raw_spin_lock(&wq->lock); + io_wq_cancel_running_work(wq, &match); + raw_spin_unlock(&wq->lock); + if (match.nr_running && !match.cancel_all) + return IO_WQ_CANCEL_RUNNING; if (match.nr_running) return IO_WQ_CANCEL_RUNNING; @@ -1119,20 +1086,20 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, +static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + struct io_wq *wq = container_of(wait, struct io_wq, wait); int i; list_del_init(&wait->entry); rcu_read_lock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wqe_activate_free_worker(wqe, acct); + io_wq_activate_free_worker(wq, acct); } rcu_read_unlock(); return 1; @@ -1140,7 +1107,7 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret, node, i; + int ret, i; struct io_wq *wq; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) @@ -1148,7 +1115,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); - wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); + wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); @@ -1161,39 +1128,28 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->do_work = data->do_work; ret = -ENOMEM; - for_each_node(node) { - struct io_wqe *wqe; - int alloc_node = node; - - if (!node_online(alloc_node)) - alloc_node = NUMA_NO_NODE; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); - if (!wqe) - goto err; - wq->wqes[node] = wqe; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) - goto err; - cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = - task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - acct->index = i; - atomic_set(&acct->nr_running, 0); - INIT_WQ_LIST(&acct->work_list); - raw_spin_lock_init(&acct->lock); - } - wqe->wq = wq; - raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); + + if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) + goto err; + cpumask_copy(wq->cpu_mask, cpu_possible_mask); + wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = + task_rlimit(current, RLIMIT_NPROC); + INIT_LIST_HEAD(&wq->wait.entry); + wq->wait.func = io_wq_hash_wake; + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wq_acct *acct = &wq->acct[i]; + + acct->index = i; + atomic_set(&acct->nr_running, 0); + INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } + raw_spin_lock_init(&wq->lock); + INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); + INIT_LIST_HEAD(&wq->all_list); + wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); @@ -1201,12 +1157,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) { - if (!wq->wqes[node]) - continue; - free_cpumask_var(wq->wqes[node]->cpu_mask); - kfree(wq->wqes[node]); - } + + free_cpumask_var(wq->cpu_mask); err_wq: kfree(wq); return ERR_PTR(ret); @@ -1219,7 +1171,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); - return worker->wqe->wq == data; + return worker->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1247,48 +1199,35 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { - int node; - if (!wq->task) return; io_wq_cancel_tw_create(wq); rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); - for_each_node(node) { - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - } + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + put_task_struct(wq->task); wq->task = NULL; } static void io_wq_destroy(struct io_wq *wq) { - int node; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); - kfree(wqe); - } + io_wq_cancel_pending_work(wq, &match); + free_cpumask_var(wq->cpu_mask); io_wq_put_hash(wq->hash); kfree(wq); } @@ -1311,9 +1250,9 @@ static bool io_wq_worker_affinity(struct io_worker *worker, void *data) struct online_data *od = data; if (od->online) - cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_set_cpu(od->cpu, worker->wq->cpu_mask); else - cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_clear_cpu(od->cpu, worker->wq->cpu_mask); return false; } @@ -1323,11 +1262,9 @@ static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) .cpu = cpu, .online = online }; - int i; rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); + io_wq_for_each_worker(wq, io_wq_worker_affinity, &od); rcu_read_unlock(); return 0; } @@ -1348,18 +1285,13 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) { - int i; - rcu_read_lock(); - for_each_node(i) { - struct io_wqe *wqe = wq->wqes[i]; - - if (mask) - cpumask_copy(wqe->cpu_mask, mask); - else - cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); - } + if (mask) + cpumask_copy(wq->cpu_mask, mask); + else + cpumask_copy(wq->cpu_mask, cpu_possible_mask); rcu_read_unlock(); + return 0; } @@ -1369,9 +1301,9 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) */ int io_wq_max_workers(struct io_wq *wq, int *new_count) { + struct io_wq_acct *acct; int prev[IO_WQ_ACCT_NR]; - bool first_node = true; - int i, node; + int i; BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); @@ -1386,21 +1318,15 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) prev[i] = 0; rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_wqe_acct *acct; - - raw_spin_lock(&wqe->lock); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; - if (first_node) - prev[i] = max_t(int, acct->max_workers, prev[i]); - if (new_count[i]) - acct->max_workers = new_count[i]; - } - raw_spin_unlock(&wqe->lock); - first_node = false; + + raw_spin_lock(&wq->lock); + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + acct = &wq->acct[i]; + prev[i] = max_t(int, acct->max_workers, prev[i]); + if (new_count[i]) + acct->max_workers = new_count[i]; } + raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2a8b8c304d2a..53f9c246e20f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -72,6 +72,7 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> +#include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -246,12 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = true; + struct io_tw_state ts = { .locked = true, }; mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &locked); - if (WARN_ON_ONCE(!locked)) + req->io_task_work.func(req, &ts); + if (WARN_ON_ONCE(!ts.locked)) return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -309,8 +310,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->apoll_cache); - io_alloc_cache_init(&ctx->netmsg_cache); + io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + sizeof(struct io_rsrc_node)); + io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -324,11 +329,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); - init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; @@ -425,7 +426,13 @@ static void io_prep_async_work(struct io_kiocb *req) req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + bool should_hash = def->hash_reg_file; + + /* don't serialize this request if the fs doesn't need it */ + if (should_hash && (req->file->f_flags & O_DIRECT) && + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + should_hash = false; + if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -450,7 +457,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -620,22 +627,22 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } -static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) +static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); - __io_cq_unlock(ctx); - io_commit_cqring_flush(ctx); - /* - * As ->task_complete implies that the ring is single tasked, cq_wait - * may only be waited on by the current in io_cqring_wait(), but since - * it will re-check the wakeup conditions once we return we can safely - * skip waking it up. - */ - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { - smp_mb(); - __io_cqring_wake(ctx); + if (ctx->task_complete) { + /* + * ->task_complete implies that only current might be waiting + * for CQEs, and obviously, we currently don't. No one is + * waiting, wakeups are futile, skip them. + */ + io_commit_cqring_flush(ctx); + } else { + __io_cq_unlock(ctx); + io_commit_cqring_flush(ctx); + io_cqring_wake(ctx); } } @@ -960,9 +967,10 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 return true; } -static void __io_req_complete_post(struct io_kiocb *req) +static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) @@ -983,7 +991,7 @@ static void __io_req_complete_post(struct io_kiocb *req) } io_put_kbuf_comp(req); io_dismantle_req(req); - io_req_put_rsrc(req); + rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid @@ -994,6 +1002,12 @@ static void __io_req_complete_post(struct io_kiocb *req) ctx->locked_free_nr++; } io_cq_unlock_post(ctx); + + if (rsrc_node) { + io_ring_submit_lock(ctx, issue_flags); + io_put_rsrc_node(ctx, rsrc_node); + io_ring_submit_unlock(ctx, issue_flags); + } } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -1003,12 +1017,12 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || !(req->ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags); } else { struct io_ring_ctx *ctx = req->ctx; mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); } } @@ -1106,11 +1120,14 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -__cold void io_free_req(struct io_kiocb *req) +static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req); + if (req->rsrc_node) { + io_tw_lock(ctx, ts); + io_put_rsrc_node(ctx, req->rsrc_node); + } io_dismantle_req(req); io_put_task_remote(req->task, 1); @@ -1120,6 +1137,12 @@ __cold void io_free_req(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); } +__cold void io_free_req(struct io_kiocb *req) +{ + req->io_task_work.func = io_free_req_tw; + io_req_task_work_add(req); +} + static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1146,22 +1169,23 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); - *locked = false; + ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, + struct io_ring_ctx **ctx, + struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; @@ -1174,18 +1198,17 @@ static unsigned int handle_tw_list(struct llist_node *node, prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); + ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } else if (!*locked) - *locked = mutex_trylock(&(*ctx)->uring_lock); - req->io_task_work.func(req, locked); + } + req->io_task_work.func(req, ts); node = next; count++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } @@ -1226,7 +1249,7 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, void tctx_task_work(struct callback_head *cb) { - bool uring_locked = false; + struct io_tw_state ts = {}; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -1243,12 +1266,12 @@ void tctx_task_work(struct callback_head *cb) do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &ts, &fake); /* skip expensive cmpxchg if there are items in the list */ if (READ_ONCE(tctx->task_list.first) != &fake) continue; - if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { io_submit_flush_completions(ctx); if (READ_ONCE(tctx->task_list.first) != &fake) continue; @@ -1256,7 +1279,7 @@ void tctx_task_work(struct callback_head *cb) node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); - ctx_flush_and_put(ctx, &uring_locked); + ctx_flush_and_put(ctx, &ts); /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) @@ -1279,42 +1302,67 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; + struct llist_node *first; - percpu_ref_get(&ctx->refs); - - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) - goto put_ref; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + flags &= ~IOU_F_TWQ_LAZY_WAKE; - /* needed for the following wake up */ - smp_mb__after_atomic(); - - if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { - io_move_task_work_from_local(ctx); - goto put_ref; + first = READ_ONCE(ctx->work_llist.first); + do { + nr_tw_prev = 0; + if (first) { + struct io_kiocb *first_req = container_of(first, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. + */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + nr_tw = nr_tw_prev + 1; + /* Large enough to fail the nr_wait comparison below */ + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = -1U; + + req->nr_tw = nr_tw; + req->io_task_work.node.next = first; + } while (!try_cmpxchg(&ctx->work_llist.first, &first, + &req->io_task_work.node)); + + if (!first) { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx); } - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) - io_eventfd_signal(ctx); - - if (READ_ONCE(ctx->cq_waiting)) - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); - -put_ref: - percpu_ref_put(&ctx->refs); + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* no one is waiting */ + if (!nr_wait) + return; + /* either not enough or the previous add has already woken it up */ + if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + return; + /* pairs with set_current_state() in io_cqring_wait() */ + smp_mb__after_atomic(); + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - io_req_local_work_add(req); + if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && + (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { + rcu_read_lock(); + io_req_local_work_add(req, flags); + rcu_read_unlock(); return; } @@ -1341,11 +1389,11 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, false); + __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL); } } -static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; unsigned int loops = 0; @@ -1362,7 +1410,7 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); ret++; node = next; } @@ -1370,7 +1418,7 @@ again: if (!llist_empty(&ctx->work_llist)) goto again; - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); if (!llist_empty(&ctx->work_llist)) goto again; @@ -1381,46 +1429,46 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { - bool locked; + struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) return 0; - locked = true; - ret = __io_run_local_work(ctx, &locked); + ret = __io_run_local_work(ctx, &ts); /* shouldn't happen! */ - if (WARN_ON_ONCE(!locked)) + if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } static int io_run_local_work(struct io_ring_ctx *ctx) { - bool locked = mutex_trylock(&ctx->uring_lock); + struct io_tw_state ts = {}; int ret; - ret = __io_run_local_work(ctx, &locked); - if (locked) + ts.locked = mutex_trylock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, &ts); + if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, locked); + io_queue_iowq(req, ts); else io_queue_sqe(req); } @@ -1646,9 +1694,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -void io_req_task_complete(struct io_kiocb *req, bool *locked) +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (*locked) + if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); @@ -1927,9 +1975,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -int io_poll_issue(struct io_kiocb *req, bool *locked) +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } @@ -2298,8 +2346,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(req, true); + trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async @@ -2428,7 +2475,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); + ret = left = min(nr, entries); io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); @@ -2600,7 +2647,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - WRITE_ONCE(ctx->cq_waiting, 1); + int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); + + atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, @@ -2609,7 +2658,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); - WRITE_ONCE(ctx->cq_waiting, 0); + atomic_set(&ctx->cq_wait_nr, 0); if (ret < 0) break; @@ -2772,10 +2821,14 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +static void io_rsrc_node_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_rsrc_node, cache)); +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); io_wait_rsrc_data(ctx->file_data); @@ -2798,14 +2851,11 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_node); if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); + io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -2815,6 +2865,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; @@ -3031,6 +3082,10 @@ static __cold void io_ring_exit_work(struct work_struct *work) spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); + /* pairs with RCU read section in io_req_local_work_add() */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + synchronize_rcu(); + io_ring_ctx_free(ctx); } @@ -3146,6 +3201,12 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, enum io_wq_cancel cret; bool ret = false; + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) return false; @@ -3200,6 +3261,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; s64 inflight; DEFINE_WAIT(wait); @@ -3221,9 +3284,6 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) break; if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - xa_for_each(&tctx->xa, index, node) { /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) @@ -3246,7 +3306,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); io_run_task_work(); io_uring_drop_tctx_refs(current); - + xa_for_each(&tctx->xa, index, node) { + if (!llist_empty(&node->ctx->work_llist)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did @@ -3254,6 +3320,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) */ if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); +end_wait: finish_wait(&tctx->wait, &wait); } while (1); @@ -3282,7 +3349,7 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; - switch (offset) { + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: ptr = ctx->rings; @@ -3290,6 +3357,17 @@ static void *io_uring_validate_mmap_request(struct file *file, case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; + case IORING_OFF_PBUF_RING: { + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + mutex_lock(&ctx->uring_lock); + ptr = io_pbuf_get_address(ctx, bgid); + mutex_unlock(&ctx->uring_lock); + if (!ptr) + return ERR_PTR(-EINVAL); + break; + } default: return ERR_PTR(-EINVAL); } @@ -3317,6 +3395,54 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); + struct vm_unmapped_area_info info; + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. + */ + if (addr) + return -EINVAL; + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); +#ifdef SHM_COLOUR + info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL); +#else + info.align_mask = PAGE_MASK & (SHMLBA - 1UL); +#endif + info.align_offset = (unsigned long) ptr; + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) { + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = mmap_end; + addr = vm_unmapped_area(&info); + } + + return addr; +} + #else /* !CONFIG_MMU */ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) @@ -3529,6 +3655,8 @@ static const struct file_operations io_uring_fops = { #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#else + .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS @@ -4425,7 +4553,7 @@ static int __init io_uring_init(void) io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); return 0; }; __initcall(io_uring_init); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2711865f1e19..ef449e43d493 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -16,6 +16,20 @@ #endif enum { + /* don't use deferred task_work */ + IOU_F_TWQ_FORCE_NORMAL = 1, + + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. + */ + IOU_F_TWQ_LAZY_WAKE = 2, +}; + +enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, @@ -48,20 +62,20 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); -int io_poll_issue(struct io_kiocb *req, bool *locked); +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); @@ -93,7 +107,7 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, static inline void io_req_task_work_add(struct io_kiocb *req) { - __io_req_task_work_add(req, true); + __io_req_task_work_add(req, 0); } #define io_for_each_link(pos, head) \ @@ -228,8 +242,7 @@ static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -/* requires smb_mb() prior, see wq_has_sleeper() */ -static inline void __io_cqring_wake(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * Trigger waitqueue handler on all waiters on our waitqueue. This @@ -241,17 +254,11 @@ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) * waitqueue handlers, we know we have a dependency between eventfd or * epoll and should terminate multishot poll at that point. */ - if (waitqueue_active(&ctx->cq_wait)) + if (wq_has_sleeper(&ctx->cq_wait)) __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - smp_mb(); - __io_cqring_wake(ctx); -} - static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -262,9 +269,11 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; + unsigned int entries; /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + return min(entries, ctx->sq_entries); } static inline int io_run_task_work(void) @@ -299,11 +308,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!*locked) { + if (!ts->locked) { mutex_lock(&ctx->uring_lock); - *locked = true; + ts->locked = true; } } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index a90c820ce99e..748b6235c6c7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return NULL; head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + /* mmaped buffers are always contig */ + if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { buf = &br->bufs[head]; } else { int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); @@ -179,7 +180,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->buf_nr_pages) + if (bl->is_mapped) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -214,17 +215,30 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->buf_nr_pages) { - int j; - + if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (bl->is_mmap) { + if (bl->buf_ring) { + struct page *page; + + page = virt_to_head_page(bl->buf_ring); + if (put_page_testzero(page)) + free_compound_page(page); + bl->buf_ring = NULL; + } + bl->is_mmap = 0; + } else if (bl->buf_nr_pages) { + int j; + + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); + bl->is_mapped = 0; return i; } @@ -304,7 +318,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) + if (!bl->is_mapped) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -449,7 +463,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { + if (bl->is_mapped) { ret = -EINVAL; goto err; } @@ -464,23 +478,87 @@ err: return IOU_OK; } -int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) { struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; struct page **pages; int nr_pages; + pages = io_pin_pages(reg->ring_addr, + flex_array_size(br, bufs, reg->ring_entries), + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + br = page_address(pages[0]); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. + */ + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { + int i; + + for (i = 0; i < nr_pages; i++) + unpin_user_page(pages[i]); + return -EINVAL; + } +#endif + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->buf_ring = br; + bl->is_mapped = 1; + bl->is_mmap = 0; + return 0; +} + +static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t ring_size; + void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); + if (!ptr) + return -ENOMEM; + + bl->buf_ring = ptr; + bl->is_mapped = 1; + bl->is_mmap = 1; + return 0; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + int ret; + if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) + if (reg.flags & ~IOU_PBUF_RING_MMAP) return -EINVAL; + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + } else { + if (reg.ring_addr) + return -EINVAL; + } + if (!is_power_of_2(reg.ring_entries)) return -EINVAL; @@ -497,7 +575,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + if (bl->is_mapped || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -505,22 +583,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - pages = io_pin_pages(reg.ring_addr, - flex_array_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(free_bl); - return PTR_ERR(pages); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) + ret = io_pin_pbuf_ring(®, bl); + else + ret = io_alloc_pbuf_ring(®, bl); + + if (!ret) { + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; } - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + kfree(free_bl); + return ret; } int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) @@ -530,13 +607,15 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->buf_nr_pages) + if (!bl->is_mapped) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); @@ -546,3 +625,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } return 0; } + +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +{ + struct io_buffer_list *bl; + + bl = io_buffer_get_list(ctx, bgid); + if (!bl || !bl->is_mmap) + return NULL; + + return bl->buf_ring; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index c23e15d7d3ca..d14345ef61fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -23,6 +23,11 @@ struct io_buffer_list { __u16 nr_entries; __u16 head; __u16 mask; + + /* ring mapped provided buffers */ + __u8 is_mapped; + /* ring mapped provided buffers, but mmap'ed by application */ + __u8 is_mmap; }; struct io_buffer { @@ -50,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); + static inline void io_kbuf_recycle_ring(struct io_kiocb *req) { /* diff --git a/io_uring/net.c b/io_uring/net.c index 4040cf093318..89e839013837 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -184,8 +184,8 @@ static int io_setup_async_msg(struct io_kiocb *req, async_msg->msg.msg_name = &async_msg->addr; /* if were using fast_iov, set it to the new one */ if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { - size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; - async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; + size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; + async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; } return -EAGAIN; diff --git a/io_uring/net.h b/io_uring/net.h index 5ffa11bf5d2e..191009979bcb 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -5,8 +5,8 @@ #include "alloc_cache.h" -#if defined(CONFIG_NET) struct io_async_msghdr { +#if defined(CONFIG_NET) union { struct iovec fast_iov[UIO_FASTIOV]; struct { @@ -22,8 +22,11 @@ struct io_async_msghdr { struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; +#endif }; +#if defined(CONFIG_NET) + struct io_async_connect { struct sockaddr_storage address; }; diff --git a/io_uring/notif.c b/io_uring/notif.c index 09dfd0832d19..e1846a25dde1 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,7 +9,7 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } - io_req_task_complete(notif, locked); + io_req_task_complete(notif, ts); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, @@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_kiocb *notif = cmd_to_io_kiocb(nd); if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/notif.h b/io_uring/notif.h index c88c800cd89d..6dd1b30a468f 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) /* drop slot's master ref */ if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/poll.c b/io_uring/poll.c index 55306e801081..c90e47dc1e29 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -148,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req) hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -159,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) * already grabbed the mutex for us, but there is a chance it * failed. */ - io_tw_lock(ctx, locked); + io_tw_lock(ctx, ts); hash_del(&req->hash_node); req->flags &= ~REQ_F_HASH_LOCKED; } else { @@ -238,7 +238,7 @@ enum { * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) +static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; @@ -300,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, + if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data, mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, locked); + int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) @@ -326,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, bool *locked) +static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; - ret = io_poll_check_events(req, locked); + ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, locked); + io_poll_tw_hash_eject(req, ts); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -343,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -351,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } else { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); else io_req_defer_failed(req, ret); } @@ -977,7 +977,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - bool locked; + struct io_tw_state ts = {}; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); @@ -1027,8 +1027,8 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); + ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &ts); out: if (ret < 0) { req_set_fail(req); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a5ed0ee7c160..74e13230fa0c 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -27,19 +27,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); -#define IO_RSRC_REF_BATCH 100 - /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) +static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data) { - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } + return !--rsrc_data->refs; } int __io_account_mem(struct user_struct *user, unsigned long nr_pages) @@ -151,132 +145,84 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) +static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, + struct io_rsrc_put *prsrc) { - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); + struct io_ring_ctx *ctx = rsrc_data->ctx; + + if (prsrc->tag) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + rsrc_data->do_put(ctx, prsrc); } static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } + if (ref_node->inline_items) + io_rsrc_put_work_one(rsrc_data, &ref_node->item); - rsrc_data->do_put(ctx, prsrc); + list_for_each_entry_safe(prsrc, tmp, &ref_node->item_list, list) { + list_del(&prsrc->list); + io_rsrc_put_work_one(rsrc_data, prsrc); kfree(prsrc); } - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) + io_rsrc_node_destroy(rsrc_data->ctx, ref_node); + if (io_put_rsrc_data_ref(rsrc_data)) complete(&rsrc_data->done); } -void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -void io_rsrc_put_tw(struct callback_head *cb) -{ - struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, - rsrc_put_tw); - - io_rsrc_put_work(&ctx->rsrc_put_work.work); -} - void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (data && !atomic_dec_and_test(&data->refs)) + if (data && !io_put_rsrc_data_ref(data)) wait_for_completion(&data->done); } -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + kfree(node); } -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +void io_rsrc_node_ref_zero(struct io_rsrc_node *node) + __must_hold(&node->rsrc_data->ctx->uring_lock) { - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; - while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (!node->done) break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - - if (!first_add) - return; - if (ctx->submitter_task) { - if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, - ctx->notify_method)) - return; + list_del(&node->node); + __io_rsrc_put_work(node); } - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; + struct io_cache_entry *entry; - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; + entry = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (entry) { + ref_node = container_of(entry, struct io_rsrc_node, cache); + } else { + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; } + + ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); + INIT_LIST_HEAD(&ref_node->item_list); ref_node->done = false; + ref_node->inline_items = 0; return ref_node; } @@ -287,18 +233,15 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, WARN_ON_ONCE(!ctx->rsrc_backup_node); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - io_rsrc_refs_drop(ctx); - if (data_to_kill) { struct io_rsrc_node *rsrc_node = ctx->rsrc_node; rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); + data_to_kill->refs++; + /* put master ref */ + io_put_rsrc_node(ctx, rsrc_node); ctx->rsrc_node = NULL; } @@ -312,7 +255,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); + ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); return ctx->rsrc_backup_node ? 0 : -ENOMEM; } @@ -329,8 +272,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; io_rsrc_node_switch(ctx, data); - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) + /* kill initial ref */ + if (io_put_rsrc_data_ref(data)) return 0; data->quiesce = true; @@ -338,19 +281,19 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, do { ret = io_run_task_work_sig(ctx); if (ret < 0) { - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); + if (!data->refs) { + ret = 0; + } else { + /* restore the master reference */ + data->refs++; + } break; } - - flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) <= 0) + if (!data->refs) break; /* * it has been revived by another thread while @@ -425,6 +368,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; data->do_put = do_put; + data->refs = 1; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -435,8 +379,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - - atomic_set(&data->refs, 1); init_completion(&data->done); *pdata = data; return 0; @@ -758,15 +700,23 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; + bool inline_item = true; - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; + if (!node->inline_items) { + prsrc = &node->item; + node->inline_items++; + } else { + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + inline_item = false; + } prsrc->tag = *tag_slot; *tag_slot = 0; prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); + if (!inline_item) + list_add(&prsrc->list, &node->item_list); return 0; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f27f4975217d..8729f2fee256 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,6 +4,10 @@ #include <net/af_unix.h> +#include "alloc_cache.h" + +#define IO_NODE_ALLOC_CACHE_MAX 32 + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -31,18 +35,29 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - atomic_t refs; struct completion done; + int refs; bool quiesce; }; struct io_rsrc_node { - struct percpu_ref refs; + union { + struct io_cache_entry cache; + struct io_rsrc_data *rsrc_data; + }; struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; struct llist_node llist; + int refs; bool done; + + /* + * Keeps a list of struct io_rsrc_put to be completed. Each entry + * represents one rsrc (e.g. file or buffer), but all of them should've + * came from the same table and so are of the same type. + */ + struct list_head item_list; + struct io_rsrc_put item; + int inline_items; }; struct io_mapped_ubuf { @@ -54,11 +69,10 @@ struct io_mapped_ubuf { }; void io_rsrc_put_tw(struct callback_head *cb); +void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); -void io_rsrc_refs_drop(struct io_ring_ctx *ctx); +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); @@ -107,36 +121,24 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - percpu_ref_put_many(&node->refs, nr); -} + lockdep_assert_held(&ctx->uring_lock); -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); + if (node && !--node->refs) + io_rsrc_node_ref_zero(node); } static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) { - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } + io_put_rsrc_node(ctx, req->rsrc_node); } -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_node *node) { - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + node->refs++; } static inline void io_req_set_rsrc_node(struct io_kiocb *req, @@ -149,7 +151,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, lockdep_assert_held(&ctx->uring_lock); req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx); + io_charge_rsrc_node(ctx, ctx->rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } diff --git a/io_uring/rw.c b/io_uring/rw.c index 4c233910e200..3f118ed46e4f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,16 +283,16 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, bool *locked) +static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; req->cqe.flags |= io_put_kbuf(req, issue_flags); } - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, io_fixup_rw_res(req, res), 0); req->io_task_work.func = io_req_rw_complete; - io_req_task_work_add(req); + __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -447,26 +447,25 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) ppos = io_kiocb_ppos(kiocb); while (iov_iter_count(iter)) { - struct iovec iovec; + void __user *addr; + size_t len; ssize_t nr; if (iter_is_ubuf(iter)) { - iovec.iov_base = iter->ubuf + iter->iov_offset; - iovec.iov_len = iov_iter_count(iter); + addr = iter->ubuf + iter->iov_offset; + len = iov_iter_count(iter); } else if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); + addr = iter_iov_addr(iter); + len = iter_iov_len(iter); } else { - iovec.iov_base = u64_to_user_ptr(rw->addr); - iovec.iov_len = rw->len; + addr = u64_to_user_ptr(rw->addr); + len = rw->len; } - if (ddir == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } + if (ddir == READ) + nr = file->f_op->read(file, addr, len, ppos); + else + nr = file->f_op->write(file, addr, len, ppos); if (nr < 0) { if (!ret) @@ -482,7 +481,7 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if (!rw->len) break; } - if (nr != iovec.iov_len) + if (nr != len) break; } @@ -503,10 +502,10 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, if (!iovec) { unsigned iov_off = 0; - io->s.iter.iov = io->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - io->s.iter.iov += iov_off; + io->s.iter.__iov = io->s.fast_iov; + if (iter->__iov != fast_iov) { + iov_off = iter_iov(iter) - fast_iov; + io->s.iter.__iov += iov_off; } if (io->s.fast_iov != fast_iov) memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, @@ -1002,7 +1001,7 @@ void io_rw_fail(struct io_kiocb *req) int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; - unsigned int poll_flags = BLK_POLL_NOSLEEP; + unsigned int poll_flags = 0; DEFINE_IO_COMP_BATCH(iob); int nr_events = 0; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 826a51bca3e4..5c6c6f720809 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,9 +101,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) +static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) { - io_tw_lock(link->ctx, locked); + io_tw_lock(link->ctx, ts); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -112,7 +112,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, locked); + io_req_task_complete(link, ts); link = nxt; } } @@ -265,9 +265,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9a1dee571872..f7a96bc76ea1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -12,10 +12,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; ioucmd->task_work_cb(ioucmd, issue_flags); } @@ -73,6 +73,7 @@ int io_uring_cmd_prep_async(struct io_kiocb *req) cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); memcpy(req->async_data, ioucmd->cmd, cmd_size); + ioucmd->cmd = req->async_data; return 0; } @@ -129,9 +130,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) WRITE_ONCE(ioucmd->cookie, NULL); } - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { if (!req_has_async_data(req)) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 34dd6bdf2fba..6d0203a3c00a 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -14,8 +14,6 @@ #include <linux/scatterlist.h> #include <linux/instrumented.h> -#define PIPE_PARANOIA /* for now */ - /* covers ubuf and kbuf alike */ #define iterate_buf(i, n, base, len, off, __p, STEP) { \ size_t __maybe_unused off = 0; \ @@ -126,13 +124,13 @@ __out: \ iterate_buf(i, n, base, len, off, \ i->ubuf, (I)) \ } else if (likely(iter_is_iovec(i))) { \ - const struct iovec *iov = i->iov; \ + const struct iovec *iov = iter_iov(i); \ void __user *base; \ size_t len; \ iterate_iovec(i, n, base, len, off, \ iov, (I)) \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ + i->nr_segs -= iov - iter_iov(i); \ + i->__iov = iov; \ } else if (iov_iter_is_bvec(i)) { \ const struct bio_vec *bvec = i->bvec; \ void *base; \ @@ -198,150 +196,6 @@ static int copyin(void *to, const void __user *from, size_t n) return res; } -#ifdef PIPE_PARANOIA -static bool sanity(const struct iov_iter *i) -{ - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_head = pipe->head; - unsigned int p_tail = pipe->tail; - unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); - unsigned int i_head = i->head; - unsigned int idx; - - if (i->last_offset) { - struct pipe_buffer *p; - if (unlikely(p_occupancy == 0)) - goto Bad; // pipe must be non-empty - if (unlikely(i_head != p_head - 1)) - goto Bad; // must be at the last buffer... - - p = pipe_buf(pipe, i_head); - if (unlikely(p->offset + p->len != abs(i->last_offset))) - goto Bad; // ... at the end of segment - } else { - if (i_head != p_head) - goto Bad; // must be right after the last buffer - } - return true; -Bad: - printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset); - printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", - p_head, p_tail, pipe->ring_size); - for (idx = 0; idx < pipe->ring_size; idx++) - printk(KERN_ERR "[%p %p %d %d]\n", - pipe->bufs[idx].ops, - pipe->bufs[idx].page, - pipe->bufs[idx].offset, - pipe->bufs[idx].len); - WARN_ON(1); - return false; -} -#else -#define sanity(i) true -#endif - -static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size) -{ - struct page *page = alloc_page(GFP_USER); - if (page) { - struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); - *buf = (struct pipe_buffer) { - .ops = &default_pipe_buf_ops, - .page = page, - .offset = 0, - .len = size - }; - } - return page; -} - -static void push_page(struct pipe_inode_info *pipe, struct page *page, - unsigned int offset, unsigned int size) -{ - struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); - *buf = (struct pipe_buffer) { - .ops = &page_cache_pipe_buf_ops, - .page = page, - .offset = offset, - .len = size - }; - get_page(page); -} - -static inline int last_offset(const struct pipe_buffer *buf) -{ - if (buf->ops == &default_pipe_buf_ops) - return buf->len; // buf->offset is 0 for those - else - return -(buf->offset + buf->len); -} - -static struct page *append_pipe(struct iov_iter *i, size_t size, - unsigned int *off) -{ - struct pipe_inode_info *pipe = i->pipe; - int offset = i->last_offset; - struct pipe_buffer *buf; - struct page *page; - - if (offset > 0 && offset < PAGE_SIZE) { - // some space in the last buffer; add to it - buf = pipe_buf(pipe, pipe->head - 1); - size = min_t(size_t, size, PAGE_SIZE - offset); - buf->len += size; - i->last_offset += size; - i->count -= size; - *off = offset; - return buf->page; - } - // OK, we need a new buffer - *off = 0; - size = min_t(size_t, size, PAGE_SIZE); - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) - return NULL; - page = push_anon(pipe, size); - if (!page) - return NULL; - i->head = pipe->head - 1; - i->last_offset = size; - i->count -= size; - return page; -} - -static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - struct pipe_inode_info *pipe = i->pipe; - unsigned int head = pipe->head; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - if (offset && i->last_offset == -offset) { // could we merge it? - struct pipe_buffer *buf = pipe_buf(pipe, head - 1); - if (buf->page == page) { - buf->len += bytes; - i->last_offset -= bytes; - i->count -= bytes; - return bytes; - } - } - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) - return 0; - - push_page(pipe, page, offset, bytes); - i->last_offset = -(offset + bytes); - i->head = head; - i->count -= bytes; - return bytes; -} - /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator @@ -367,7 +221,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -410,7 +264,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -437,7 +291,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, .nofault = false, .user_backed = true, .data_source = direction, - .iov = iov, + .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count @@ -445,46 +299,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); -// returns the offset in partial buffer (if any) -static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages) -{ - struct pipe_inode_info *pipe = i->pipe; - int used = pipe->head - pipe->tail; - int off = i->last_offset; - - *npages = max((int)pipe->max_usage - used, 0); - - if (off > 0 && off < PAGE_SIZE) { // anon and not full - (*npages)++; - return off; - } - return 0; -} - -static size_t copy_pipe_to_iter(const void *addr, size_t bytes, - struct iov_iter *i) -{ - unsigned int off, chunk; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - for (size_t n = bytes; n; n -= chunk) { - struct page *page = append_pipe(i, n, &off); - chunk = min_t(size_t, n, PAGE_SIZE - off); - if (!page) - return bytes - n; - memcpy_to_page(page, off, addr, chunk); - addr += chunk; - } - return bytes; -} - static __wsum csum_and_memcpy(void *to, const void *from, size_t len, __wsum sum, size_t off) { @@ -492,44 +306,10 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len, return csum_block_add(sum, next, off); } -static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, - struct iov_iter *i, __wsum *sump) -{ - __wsum sum = *sump; - size_t off = 0; - unsigned int chunk, r; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - while (bytes) { - struct page *page = append_pipe(i, bytes, &r); - char *p; - - if (!page) - break; - chunk = min_t(size_t, bytes, PAGE_SIZE - r); - p = kmap_local_page(page); - sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off); - kunmap_local(p); - off += chunk; - bytes -= chunk; - } - *sump = sum; - return off; -} - size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_pipe_to_iter(addr, bytes, i); if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, @@ -551,42 +331,6 @@ static int copyout_mc(void __user *to, const void *from, size_t n) return n; } -static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, - struct iov_iter *i) -{ - size_t xfer = 0; - unsigned int off, chunk; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - while (bytes) { - struct page *page = append_pipe(i, bytes, &off); - unsigned long rem; - char *p; - - if (!page) - break; - chunk = min_t(size_t, bytes, PAGE_SIZE - off); - p = kmap_local_page(page); - rem = copy_mc_to_kernel(p + off, addr + xfer, chunk); - chunk -= rem; - kunmap_local(p); - xfer += chunk; - bytes -= chunk; - if (rem) { - iov_iter_revert(i, rem); - break; - } - } - return xfer; -} - /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address @@ -606,9 +350,8 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * - * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. - * Compare to copy_to_iter() where only ITER_IOVEC attempts might return - * a short copy. + * * ITER_KVEC and ITER_BVEC can return short copies. Compare to + * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ @@ -616,8 +359,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_mc_pipe_to_iter(addr, bytes, i); if (user_backed_iter(i)) might_fault(); __iterate_and_advance(i, bytes, base, len, off, @@ -723,8 +464,6 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, return 0; if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_page_to_iter_pipe(page, offset, bytes, i); page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { @@ -755,8 +494,6 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte return 0; if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_page_to_iter_pipe(page, offset, bytes, i); page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { @@ -809,36 +546,8 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); -static size_t pipe_zero(size_t bytes, struct iov_iter *i) -{ - unsigned int chunk, off; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - for (size_t n = bytes; n; n -= chunk) { - struct page *page = append_pipe(i, n, &off); - char *p; - - if (!page) - return bytes - n; - chunk = min_t(size_t, n, PAGE_SIZE - off); - p = kmap_local_page(page); - memset(p + off, 0, chunk); - kunmap_local(p); - } - return bytes; -} - size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(iov_iter_is_pipe(i))) - return pipe_zero(bytes, i); iterate_and_advance(i, bytes, base, len, count, clear_user(base, len), memset(base, 0, len) @@ -869,32 +578,6 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt } EXPORT_SYMBOL(copy_page_from_iter_atomic); -static void pipe_advance(struct iov_iter *i, size_t size) -{ - struct pipe_inode_info *pipe = i->pipe; - int off = i->last_offset; - - if (!off && !size) { - pipe_discard_from(pipe, i->start_head); // discard everything - return; - } - i->count -= size; - while (1) { - struct pipe_buffer *buf = pipe_buf(pipe, i->head); - if (off) /* make it relative to the beginning of buffer */ - size += abs(off) - buf->offset; - if (size <= buf->len) { - buf->len = size; - i->last_offset = last_offset(buf); - break; - } - size -= buf->len; - i->head++; - off = 0; - } - pipe_discard_from(pipe, i->head + 1); // discard everything past this one -} - static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; @@ -924,14 +607,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->count -= size; size += i->iov_offset; // from beginning of current segment - for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { + for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; - i->nr_segs -= iov - i->iov; - i->iov = iov; + i->nr_segs -= iov - iter_iov(i); + i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) @@ -946,8 +629,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size) iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); - } else if (iov_iter_is_pipe(i)) { - pipe_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } @@ -961,26 +642,6 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; - if (unlikely(iov_iter_is_pipe(i))) { - struct pipe_inode_info *pipe = i->pipe; - unsigned int head = pipe->head; - - while (head > i->start_head) { - struct pipe_buffer *b = pipe_buf(pipe, --head); - if (unroll < b->len) { - b->len -= unroll; - i->last_offset = last_offset(b); - i->head = head; - return; - } - unroll -= b->len; - pipe_buf_release(pipe, b); - pipe->head--; - } - i->last_offset = 0; - i->head = head; - return; - } if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { @@ -1006,12 +667,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) unroll -= n; } } else { /* same logics for iovec and kvec */ - const struct iovec *iov = i->iov; + const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { - i->iov = iov; + i->__iov = iov; i->iov_offset = n - unroll; return; } @@ -1028,7 +689,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return min(i->count, i->iov->iov_len - i->iov_offset); + return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } @@ -1068,24 +729,6 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_bvec); -void iov_iter_pipe(struct iov_iter *i, unsigned int direction, - struct pipe_inode_info *pipe, - size_t count) -{ - BUG_ON(direction != READ); - WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); - *i = (struct iov_iter){ - .iter_type = ITER_PIPE, - .data_source = false, - .pipe = pipe, - .head = pipe->head, - .start_head = pipe->head, - .last_offset = 0, - .count = count - }; -} -EXPORT_SYMBOL(iov_iter_pipe); - /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. @@ -1143,13 +786,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) + if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; size -= len; @@ -1210,19 +854,6 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); - if (iov_iter_is_pipe(i)) { - size_t size = i->count; - - if (size & len_mask) - return false; - if (size && i->last_offset > 0) { - if (i->last_offset & addr_mask) - return false; - } - - return true; - } - if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; @@ -1242,9 +873,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len) { - res |= (unsigned long)i->iov[k].iov_base + skip; + res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; @@ -1292,14 +924,6 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); - if (iov_iter_is_pipe(i)) { - size_t size = i->count; - - if (size && i->last_offset > 0) - return size | i->last_offset; - return size; - } - if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; @@ -1321,14 +945,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) return ~0U; for (k = 0; k < i->nr_segs; k++) { - if (i->iov[k].iov_len) { - unsigned long base = (unsigned long)i->iov[k].iov_base; + const struct iovec *iov = iter_iov(i) + k; + if (iov->iov_len) { + unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end - v = base + i->iov[k].iov_len; - if (size <= i->iov[k].iov_len) + v = base + iov->iov_len; + if (size <= iov->iov_len) break; - size -= i->iov[k].iov_len; + size -= iov->iov_len; } } return res; @@ -1351,36 +976,6 @@ static int want_pages_array(struct page ***res, size_t size, return count; } -static ssize_t pipe_get_pages(struct iov_iter *i, - struct page ***pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - unsigned int npages, count, off, chunk; - struct page **p; - size_t left; - - if (!sanity(i)) - return -EFAULT; - - *start = off = pipe_npages(i, &npages); - if (!npages) - return -EFAULT; - count = want_pages_array(pages, maxsize, off, min(npages, maxpages)); - if (!count) - return -ENOMEM; - p = *pages; - for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) { - struct page *page = append_pipe(i, left, &off); - if (!page) - break; - chunk = min_t(size_t, left, PAGE_SIZE - off); - get_page(*p++ = page); - } - if (!npages) - return -EFAULT; - return maxsize - left; -} - static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { @@ -1444,13 +1039,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; - return (unsigned long)i->iov[k].iov_base + skip; + return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } @@ -1473,8 +1069,7 @@ static struct page *first_bvec_segment(const struct iov_iter *i, static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - unsigned int maxpages, size_t *start, - iov_iter_extraction_t extraction_flags) + unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; @@ -1484,8 +1079,6 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; - if (extraction_flags & ITER_ALLOW_P2PDMA) - gup_flags |= FOLL_PCI_P2PDMA; if (likely(user_backed_iter(i))) { unsigned long addr; @@ -1530,57 +1123,37 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, } return maxsize; } - if (iov_iter_is_pipe(i)) - return pipe_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } -ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start, iov_iter_extraction_t extraction_flags) +ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, + size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); - return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, - start, extraction_flags); -} -EXPORT_SYMBOL_GPL(iov_iter_get_pages); - -ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, - size_t maxsize, unsigned maxpages, size_t *start) -{ - return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0); + return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start, iov_iter_extraction_t extraction_flags) +ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, + struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; - len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, - extraction_flags); + len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } -EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc); - -ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, - struct page ***pages, size_t maxsize, size_t *start) -{ - return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0); -} -EXPORT_SYMBOL(iov_iter_get_pages_alloc2); +EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc2); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) @@ -1621,9 +1194,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, } sum = csum_shift(csstate->csum, csstate->off); - if (unlikely(iov_iter_is_pipe(i))) - bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum); - else iterate_and_advance(i, bytes, base, len, off, ({ + iterate_and_advance(i, bytes, base, len, off, ({ next = csum_and_copy_to_user(addr + off, base, len); sum = csum_block_add(sum, next, off); next ? 0 : len; @@ -1662,7 +1233,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages) const struct iovec *p; int npages = 0; - for (p = i->iov; size; skip = 0, p++) { + for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); @@ -1708,15 +1279,6 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); - if (iov_iter_is_pipe(i)) { - int npages; - - if (!sanity(i)) - return 0; - - pipe_npages(i, &npages); - return min(npages, maxpages); - } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); @@ -1729,17 +1291,13 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; - if (unlikely(iov_iter_is_pipe(new))) { - WARN_ON(1); - return NULL; - } if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ - return new->iov = kmemdup(new->iov, + return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; @@ -1828,6 +1386,30 @@ struct iovec *iovec_from_user(const struct iovec __user *uvec, return iov; } +/* + * Single segment iovec supplied by the user, import it as ITER_UBUF. + */ +static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, + struct iovec **iovp, struct iov_iter *i, + bool compat) +{ + struct iovec *iov = *iovp; + ssize_t ret; + + if (compat) + ret = copy_compat_iovec_from_user(iov, uvec, 1); + else + ret = copy_iovec_from_user(iov, uvec, 1); + if (unlikely(ret)) + return ret; + + ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); + if (unlikely(ret)) + return ret; + *iovp = NULL; + return i->count; +} + ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) @@ -1836,6 +1418,9 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned long seg; struct iovec *iov; + if (nr_segs == 1) + return __import_iovec_ubuf(type, uvec, iovp, i, compat); + iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; @@ -1914,9 +1499,7 @@ int import_single_range(int rw, void __user *buf, size_t len, if (unlikely(!access_ok(buf, len))) return -EFAULT; - iov->iov_base = buf; - iov->iov_len = len; - iov_iter_init(i, rw, iov, 1, len); + iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL(import_single_range); @@ -1966,7 +1549,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else - i->iov -= state->nr_segs - i->nr_segs; + i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } diff --git a/mm/filemap.c b/mm/filemap.c index a34abfe8c654..6f3a7e53fccf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2687,8 +2687,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, - iov_iter_is_pipe(iter)); + error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; @@ -2964,7 +2963,6 @@ out: return total_spliced ? total_spliced : error; } -EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, diff --git a/mm/madvise.c b/mm/madvise.c index 405a2c4a0a18..24c5cffe3e6c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1444,7 +1444,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; - struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; @@ -1491,12 +1491,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { - iovec = iov_iter_iovec(&iter); - ret = do_madvise(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), + iter_iov_len(&iter), behavior); if (ret < 0) break; - iov_iter_advance(&iter, iovec.iov_len); + iov_iter_advance(&iter, iter_iov_len(&iter)); } ret = (total_len - iov_iter_count(&iter)) ? : ret; diff --git a/mm/shmem.c b/mm/shmem.c index 4d6b8faa5f2f..16378b281a5d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2846,6 +2846,138 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } +static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return true; +} + +static void zero_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ +} + +static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return false; +} + +static const struct pipe_buf_operations zero_pipe_buf_ops = { + .release = zero_pipe_buf_release, + .try_steal = zero_pipe_buf_try_steal, + .get = zero_pipe_buf_get, +}; + +static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, + loff_t fpos, size_t size) +{ + size_t offset = fpos & ~PAGE_MASK; + + size = min_t(size_t, size, PAGE_SIZE - offset); + + if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + + *buf = (struct pipe_buffer) { + .ops = &zero_pipe_buf_ops, + .page = ZERO_PAGE(0), + .offset = offset, + .len = size, + }; + pipe->head++; + } + + return size; +} + +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct address_space *mapping = inode->i_mapping; + struct folio *folio = NULL; + size_t total_spliced = 0, used, npages, n, part; + loff_t isize; + int error = 0; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + + do { + if (*ppos >= i_size_read(inode)) + break; + + error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ); + if (error) { + if (error == -EINVAL) + error = 0; + break; + } + if (folio) { + folio_unlock(folio); + + if (folio_test_hwpoison(folio)) { + error = -EIO; + break; + } + } + + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + break; + part = min_t(loff_t, isize - *ppos, len); + + if (folio) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so we can + * now splice it into the pipe. + */ + n = splice_folio_into_pipe(pipe, folio, *ppos, part); + folio_put(folio); + folio = NULL; + } else { + n = splice_zeropage_into_pipe(pipe, *ppos, len); + } + + if (!n) + break; + len -= n; + total_spliced += n; + *ppos += n; + in->f_ra.prev_pos = *ppos; + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + break; + + cond_resched(); + } while (len); + + if (folio) + folio_put(folio); + + file_accessed(in); + return total_spliced ? total_spliced : error; +} + static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; @@ -4082,7 +4214,7 @@ static const struct file_operations shmem_file_operations = { .read_iter = shmem_file_read_iter, .write_iter = generic_file_write_iter, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 3d0c4a5b701b..91c87cdb786e 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3521,6 +3521,7 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; + const struct iovec *iov = iter_iov(to); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; @@ -3530,18 +3531,20 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; - if (!iter_is_iovec(to)) + if (!to->user_backed) return -EINVAL; if (to->nr_segs > 1024 || to->nr_segs != runtime->channels) return -EINVAL; - if (!frame_aligned(runtime, to->iov->iov_len)) + if (!frame_aligned(runtime, iov->iov_len)) return -EINVAL; - frames = bytes_to_samples(runtime, to->iov->iov_len); + frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; - for (i = 0; i < to->nr_segs; ++i) - bufs[i] = to->iov[i].iov_base; + for (i = 0; i < to->nr_segs; ++i) { + bufs[i] = iov->iov_base; + iov++; + } result = snd_pcm_lib_readv(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); @@ -3558,6 +3561,7 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; + const struct iovec *iov = iter_iov(from); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; @@ -3567,17 +3571,19 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; - if (!iter_is_iovec(from)) + if (!from->user_backed) return -EINVAL; if (from->nr_segs > 128 || from->nr_segs != runtime->channels || - !frame_aligned(runtime, from->iov->iov_len)) + !frame_aligned(runtime, iov->iov_len)) return -EINVAL; - frames = bytes_to_samples(runtime, from->iov->iov_len); + frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; - for (i = 0; i < from->nr_segs; ++i) - bufs[i] = from->iov[i].iov_base; + for (i = 0; i < from->nr_segs; ++i) { + bufs[i] = iov->iov_base; + iov++; + } result = snd_pcm_lib_writev(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); |