summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile2
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/aio.c36
-rw-r--r--fs/attr.c74
-rw-r--r--fs/befs/linuxvfs.c16
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c88
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/block-group.c1
-rw-r--r--fs/btrfs/compression.c59
-rw-r--r--fs/btrfs/compression.h7
-rw-r--r--fs/btrfs/ctree.h67
-rw-r--r--fs/btrfs/delayed-inode.c364
-rw-r--r--fs/btrfs/delayed-inode.h11
-rw-r--r--fs/btrfs/disk-io.c281
-rw-r--r--fs/btrfs/disk-io.h17
-rw-r--r--fs/btrfs/extent-tree.c78
-rw-r--r--fs/btrfs/extent_io.c684
-rw-r--r--fs/btrfs/extent_io.h6
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/inode.c608
-rw-r--r--fs/btrfs/ioctl.c139
-rw-r--r--fs/btrfs/lzo.c28
-rw-r--r--fs/btrfs/ordered-data.c40
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/raid56.c792
-rw-r--r--fs/btrfs/raid56.h168
-rw-r--r--fs/btrfs/reflink.c13
-rw-r--r--fs/btrfs/scrub.c71
-rw-r--r--fs/btrfs/send.c460
-rw-r--r--fs/btrfs/send.h169
-rw-r--r--fs/btrfs/space-info.c34
-rw-r--r--fs/btrfs/space-info.h4
-rw-r--r--fs/btrfs/struct-funcs.c11
-rw-r--r--fs/btrfs/subpage.c4
-rw-r--r--fs/btrfs/super.c28
-rw-r--r--fs/btrfs/sysfs.c182
-rw-r--r--fs/btrfs/transaction.c22
-rw-r--r--fs/btrfs/tree-log.c20
-rw-r--r--fs/btrfs/tree-log.h3
-rw-r--r--fs/btrfs/volumes.c358
-rw-r--r--fs/btrfs/volumes.h47
-rw-r--r--fs/btrfs/zoned.c6
-rw-r--r--fs/btrfs/zstd.c42
-rw-r--r--fs/buffer.c363
-rw-r--r--fs/coda/symlink.c11
-rw-r--r--fs/cramfs/inode.c17
-rw-r--r--fs/crypto/fname.c36
-rw-r--r--fs/crypto/fscrypt_private.h11
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/keysetup.c7
-rw-r--r--fs/crypto/policy.c49
-rw-r--r--fs/dlm/Kconfig9
-rw-r--r--fs/dlm/Makefile2
-rw-r--r--fs/dlm/ast.c4
-rw-r--r--fs/dlm/config.c21
-rw-r--r--fs/dlm/config.h3
-rw-r--r--fs/dlm/dlm_internal.h32
-rw-r--r--fs/dlm/lock.c143
-rw-r--r--fs/dlm/lock.h17
-rw-r--r--fs/dlm/lockspace.c31
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/dlm/member.c30
-rw-r--r--fs/dlm/plock.c51
-rw-r--r--fs/dlm/recoverd.c35
-rw-r--r--fs/dlm/user.c21
-rw-r--r--fs/efivarfs/Makefile2
-rw-r--r--fs/efivarfs/internal.h40
-rw-r--r--fs/efivarfs/super.c15
-rw-r--r--fs/efivarfs/vars.c738
-rw-r--r--fs/erofs/decompressor_lzma.c1
-rw-r--r--fs/exec.c9
-rw-r--r--fs/ext2/dir.c20
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/inode.c61
-rw-r--r--fs/ext2/namei.c10
-rw-r--r--fs/ext2/super.c6
-rw-r--r--fs/ext4/inode.c58
-rw-r--r--fs/f2fs/checkpoint.c4
-rw-r--r--fs/f2fs/compress.c246
-rw-r--r--fs/f2fs/data.c92
-rw-r--r--fs/f2fs/f2fs.h34
-rw-r--r--fs/f2fs/file.c30
-rw-r--r--fs/f2fs/gc.c8
-rw-r--r--fs/f2fs/gc.h21
-rw-r--r--fs/f2fs/node.c12
-rw-r--r--fs/f2fs/recovery.c10
-rw-r--r--fs/f2fs/super.c53
-rw-r--r--fs/fat/file.c9
-rw-r--r--fs/freevxfs/vxfs_immed.c43
-rw-r--r--fs/freevxfs/vxfs_subr.c6
-rw-r--r--fs/fuse/dax.c2
-rw-r--r--fs/fuse/dir.c7
-rw-r--r--fs/fuse/file.c39
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/file.c3
-rw-r--r--fs/gfs2/glock.c7
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/lops.c28
-rw-r--r--fs/hfs/bnode.c4
-rw-r--r--fs/hfsplus/bnode.c4
-rw-r--r--fs/hostfs/hostfs_kern.c6
-rw-r--r--fs/hugetlbfs/inode.c69
-rw-r--r--fs/inode.c178
-rw-r--r--fs/io-wq.c1424
-rw-r--r--fs/io-wq.h228
-rw-r--r--fs/io_uring.c13257
-rw-r--r--fs/iomap/buffered-io.c95
-rw-r--r--fs/iomap/direct-io.c4
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/inode.c18
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/ksmbd/smb2pdu.c32
-rw-r--r--fs/ksmbd/transport_rdma.c10
-rw-r--r--fs/ksmbd/transport_tcp.c2
-rw-r--r--fs/ksmbd/vfs.c8
-rw-r--r--fs/mpage.c125
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/read.c4
-rw-r--r--fs/nfs/write.c16
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/nilfs2/page.c60
-rw-r--r--fs/ntfs/aops.c6
-rw-r--r--fs/ntfs/aops.h7
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ntfs3/attrib.c2
-rw-r--r--fs/ntfs3/bitmap.c4
-rw-r--r--fs/ntfs3/file.c1
-rw-r--r--fs/ntfs3/frecord.c49
-rw-r--r--fs/ntfs3/fslog.c4
-rw-r--r--fs/ntfs3/index.c27
-rw-r--r--fs/ntfs3/inode.c9
-rw-r--r--fs/ntfs3/ntfs_fs.h7
-rw-r--r--fs/ntfs3/record.c5
-rw-r--r--fs/ntfs3/super.c8
-rw-r--r--fs/ntfs3/xattr.c6
-rw-r--r--fs/ocfs2/aops.c28
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/refcounttree.c42
-rw-r--r--fs/ocfs2/slot_map.c46
-rw-r--r--fs/ocfs2/super.c21
-rw-r--r--fs/open.c60
-rw-r--r--fs/orangefs/inode.c4
-rw-r--r--fs/overlayfs/copy_up.c4
-rw-r--r--fs/overlayfs/overlayfs.h12
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/pstore/inode.c1
-rw-r--r--fs/pstore/platform.c64
-rw-r--r--fs/pstore/zone.c12
-rw-r--r--fs/quota/dquot.c17
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/reiserfs/inode.c4
-rw-r--r--fs/reiserfs/xattr.c9
-rw-r--r--fs/remap_range.c11
-rw-r--r--fs/squashfs/file.c15
-rw-r--r--fs/ubifs/file.c29
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/util.c11
-rw-r--r--fs/unicode/mkutf8data.c38
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_item.c43
-rw-r--r--fs/xfs/xfs_file.c11
-rw-r--r--fs/xfs/xfs_icache.c56
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c64
-rw-r--r--fs/xfs/xfs_iomap.c11
-rw-r--r--fs/xfs/xfs_iops.c14
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c9
-rw-r--r--fs/xfs/xfs_super.c9
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/zonefs/super.c4
176 files changed, 5174 insertions, 18782 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 208a74e0b00e..93b80529f8e8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
-obj-$(CONFIG_IO_URING) += io_uring.o
-obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index bbb2c210d139..97f50e9fd9eb 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -132,12 +132,6 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (IS_ERR(page))
return PTR_ERR(page);
- if (PageError(page)) {
- ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- put_page(page);
- return ret;
- }
-
buf = kmap(page);
ret = -EINVAL;
if (buf[size - 1] == '.')
diff --git a/fs/aio.c b/fs/aio.c
index 3c249b938632..a1911e86859c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -400,8 +400,8 @@ static const struct file_operations aio_ring_fops = {
};
#if IS_ENABLED(CONFIG_MIGRATION)
-static int aio_migratepage(struct address_space *mapping, struct page *new,
- struct page *old, enum migrate_mode mode)
+static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
struct kioctx *ctx;
unsigned long flags;
@@ -435,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out;
}
- idx = old->index;
+ idx = src->index;
if (idx < (pgoff_t)ctx->nr_pages) {
- /* Make sure the old page hasn't already been changed */
- if (ctx->ring_pages[idx] != old)
+ /* Make sure the old folio hasn't already been changed */
+ if (ctx->ring_pages[idx] != &src->page)
rc = -EAGAIN;
} else
rc = -EINVAL;
@@ -447,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out_unlock;
/* Writeback must be complete */
- BUG_ON(PageWriteback(old));
- get_page(new);
+ BUG_ON(folio_test_writeback(src));
+ folio_get(dst);
- rc = migrate_page_move_mapping(mapping, new, old, 1);
+ rc = folio_migrate_mapping(mapping, dst, src, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
- put_page(new);
+ folio_put(dst);
goto out_unlock;
}
/* Take completion_lock to prevent other writes to the ring buffer
- * while the old page is copied to the new. This prevents new
+ * while the old folio is copied to the new. This prevents new
* events from being lost.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- BUG_ON(ctx->ring_pages[idx] != old);
- ctx->ring_pages[idx] = new;
+ folio_migrate_copy(dst, src);
+ BUG_ON(ctx->ring_pages[idx] != &src->page);
+ ctx->ring_pages[idx] = &dst->page;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- /* The old page is no longer accessible. */
- put_page(old);
+ /* The old folio is no longer accessible. */
+ folio_put(src);
out_unlock:
mutex_unlock(&ctx->ring_lock);
@@ -475,13 +475,13 @@ out:
spin_unlock(&mapping->private_lock);
return rc;
}
+#else
+#define aio_migrate_folio NULL
#endif
static const struct address_space_operations aio_ctx_aops = {
.dirty_folio = noop_dirty_folio,
-#if IS_ENABLED(CONFIG_MIGRATION)
- .migratepage = aio_migratepage,
-#endif
+ .migrate_folio = aio_migrate_folio,
};
static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
diff --git a/fs/attr.c b/fs/attr.c
index dbe996b0dedf..b5b8835ddf15 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -22,7 +22,7 @@
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @uid: uid to chown @inode to
+ * @ia_vfsuid: uid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -31,15 +31,15 @@
* performed on the raw inode simply passs init_user_ns.
*/
static bool chown_ok(struct user_namespace *mnt_userns,
- const struct inode *inode,
- kuid_t uid)
+ const struct inode *inode, vfsuid_t ia_vfsuid)
{
- kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
+ vfsuid_eq(ia_vfsuid, vfsuid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (uid_eq(kuid, INVALID_UID) &&
+ if (!vfsuid_valid(vfsuid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* chgrp_ok - verify permissions to chgrp inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @gid: gid to chown @inode to
+ * @ia_vfsgid: gid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* performed on the raw inode simply passs init_user_ns.
*/
static bool chgrp_ok(struct user_namespace *mnt_userns,
- const struct inode *inode, kgid_t gid)
+ const struct inode *inode, vfsgid_t ia_vfsgid)
{
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
- kgid_t mapped_gid;
-
- if (gid_eq(gid, inode->i_gid))
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) {
+ if (vfsgid_eq(ia_vfsgid, vfsgid))
return true;
- mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
- if (in_group_p(mapped_gid))
+ if (vfsgid_in_group_p(ia_vfsgid))
return true;
}
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (gid_eq(kgid, INVALID_GID) &&
+ if (!vfsgid_valid(vfsgid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
goto kill_priv;
/* Make sure a caller can chown. */
- if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
+ if ((ia_valid & ATTR_UID) &&
+ !chown_ok(mnt_userns, inode, attr->ia_vfsuid))
return -EPERM;
/* Make sure caller can chgrp. */
- if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
+ if ((ia_valid & ATTR_GID) &&
+ !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid))
return -EPERM;
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
- kgid_t mapped_gid;
+ vfsgid_t vfsgid;
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
if (ia_valid & ATTR_GID)
- mapped_gid = mapped_kgid_fs(mnt_userns,
- i_user_ns(inode), attr->ia_gid);
+ vfsgid = attr->ia_vfsgid;
else
- mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!in_group_p(mapped_gid) &&
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -219,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
* setattr_copy must be called with i_mutex held.
*
* setattr_copy updates the inode's metadata with that specified
- * in attr on idmapped mounts. If file ownership is changed setattr_copy
- * doesn't map ia_uid and ia_gid. It will asssume the caller has already
- * provided the intended values. Necessary permission checks to determine
+ * in attr on idmapped mounts. Necessary permission checks to determine
* whether or not the S_ISGID property needs to be removed are performed with
* the correct idmapped mount permission helpers.
* Noticeably missing is inode size update, which is more complex
@@ -242,10 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -254,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (!in_group_p(kgid) &&
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
@@ -306,9 +301,6 @@ EXPORT_SYMBOL(may_setattr);
* retry. Because breaking a delegation may take a long time, the
* caller should drop the i_mutex before doing so.
*
- * If file ownership is changed notify_change() doesn't map ia_uid and
- * ia_gid. It will asssume the caller has already provided the intended values.
- *
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported. Also, passing NULL is fine for callers holding
@@ -397,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
* namespace of the superblock.
*/
if (ia_valid & ATTR_UID &&
- !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsuid))
return -EOVERFLOW;
if (ia_valid & ATTR_GID &&
- !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsgid))
return -EOVERFLOW;
/* Don't allow modifications of files with invalid uids or
* gids unless those uids & gids are being made valid.
*/
if (!(ia_valid & ATTR_UID) &&
- !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
+ !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)))
return -EOVERFLOW;
if (!(ia_valid & ATTR_GID) &&
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
- error = security_inode_setattr(dentry, attr);
+ error = security_inode_setattr(mnt_userns, dentry, attr);
if (error)
return error;
error = try_break_deleg(inode, delegated_inode);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index be383fa46b12..32749fcee090 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -108,8 +108,7 @@ static const struct export_operations befs_export_operations = {
* passes it the address of befs_get_block, for mapping file
* positions to disk blocks.
*/
-static int
-befs_read_folio(struct file *file, struct folio *folio)
+static int befs_read_folio(struct file *file, struct folio *folio)
{
return block_read_full_folio(folio, befs_get_block);
}
@@ -470,13 +469,12 @@ befs_destroy_inodecache(void)
*/
static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link = page_address(page);
+ char *link = folio_address(folio);
if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
@@ -489,12 +487,12 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
goto fail;
}
link[len - 1] = '\0';
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return -EIO;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 07960529b360..6e2596ddae10 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,7 +13,6 @@ struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ebc392ea1d74..d385357e19b6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2028,10 +2028,29 @@ out:
return ret;
}
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+ void *ctx, bool ignore_offset)
{
int ret;
u64 extent_item_pos;
@@ -2049,17 +2068,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, search_commit_root,
- iterate, ctx, ignore_offset);
+ build_ino_list, ctx, ignore_offset);
return ret;
}
-typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx);
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
-static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
int ret = 0;
int slot;
@@ -2068,6 +2085,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u32 name_len;
u64 parent = 0;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2103,8 +2122,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
fs_root->root_key.objectid);
- ret = iterate(parent, name_len,
- (unsigned long)(iref + 1), eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -2118,15 +2137,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
int ret;
int slot;
u64 offset = 0;
u64 parent;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
u32 item_size;
@@ -2162,8 +2181,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
name_len = btrfs_inode_extref_name_len(eb, extref);
- ret = iterate(parent, name_len,
- (unsigned long)&extref->name, eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
if (ret)
break;
@@ -2180,34 +2199,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path, iterate_irefs_t *iterate,
- void *ctx)
-{
- int ret;
- int found_refs = 0;
-
- ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
- if (!ret)
- ++found_refs;
- else if (ret != -ENOENT)
- return ret;
-
- ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
- if (ret == -ENOENT && found_refs)
- return 0;
-
- return ret;
-}
-
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx)
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
- struct inode_fs_paths *ipath = ctx;
char *fspath;
char *fspath_min;
int i = ipath->fspath->elem_cnt;
@@ -2248,8 +2246,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
*/
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
- return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ba454032dbe2..2759de7d324c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -35,8 +35,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
+ struct btrfs_path *path, void *ctx,
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ede389f2602d..13358fbc1629 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3761,6 +3761,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f4564f32f6d9..907fc8a4c092 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -147,12 +147,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u32 csum_size = fs_info->csum_size;
const u32 sectorsize = fs_info->sectorsize;
struct page *page;
unsigned int i;
- char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
@@ -161,8 +159,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return 0;
- shash->tfm = fs_info->csum_shash;
-
for (i = 0; i < cb->nr_pages; i++) {
u32 pg_offset;
u32 bytes_left = PAGE_SIZE;
@@ -175,12 +171,11 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
- crypto_shash_digest(shash, kaddr + pg_offset,
- sectorsize, csum);
- kunmap_atomic(kaddr);
+ int ret;
- if (memcmp(&csum, cb_sum, csum_size) != 0) {
+ ret = btrfs_check_sector_csum(fs_info, page, pg_offset,
+ csum, cb_sum);
+ if (ret) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
if (btrfs_bio(bio)->device)
@@ -403,6 +398,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
kfree(cb);
}
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -414,29 +417,15 @@ static void end_compressed_bio_write(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
-
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ if (dec_and_test_compressed_bio(cb, bio)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- finish_compressed_bio_write(cb);
-out:
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
bio_put(bio);
}
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct bio *bio, int mirror_num)
-{
- blk_status_t ret;
-
- ASSERT(bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- return ret;
-}
-
/*
* Allocate a compressed_bio, which will be used to read/write on-disk
* (aka, compressed) * data.
@@ -533,7 +522,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->writeback = writeback;
- cb->orig_bio = NULL;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
if (blkcg_css)
@@ -603,9 +592,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
goto finish_cb;
}
- ret = submit_compressed_bio(fs_info, bio, 0);
- if (ret)
- goto finish_cb;
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
bio = NULL;
}
cond_resched();
@@ -765,7 +753,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int zeros;
zeros = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, zeros);
- flush_dcache_page(page);
}
}
@@ -941,9 +928,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
- if (ret)
- goto finish_cb;
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, comp_bio, mirror_num);
comp_bio = NULL;
}
}
@@ -1481,7 +1467,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
ASSERT(copy_start - decompressed < buf_len);
memcpy_to_page(bvec.bv_page, bvec.bv_offset,
buf + copy_start - decompressed, copy_len);
- flush_dcache_page(bvec.bv_page);
cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 2707404389a5..5fca7603e928 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,8 +61,11 @@ struct compressed_bio {
blk_status_t status;
int mirror_num;
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
+ union {
+ /* For reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
/*
* the start of a variable length array of checksums only
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 415bf1823fb3..4e2569f84aab 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -230,6 +230,13 @@ struct btrfs_root_backup {
#define BTRFS_SUPER_INFO_SIZE 4096
/*
+ * The reserved space at the beginning of each device.
+ * It covers the primary super block and leaves space for potential use by other
+ * tools like bootloaders or to lower potential damage of accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -248,8 +255,12 @@ struct btrfs_super_block {
__le64 chunk_root;
__le64 log_root;
- /* this will help find the new super based on the log root */
- __le64 log_root_transid;
+ /*
+ * This member has never been utilized since the very beginning, thus
+ * it's always 0 regardless of kernel version. We always use
+ * generation + 1 to read log tree root. So here we mark it deprecated.
+ */
+ __le64 __unused_log_root_transid;
__le64 total_bytes;
__le64 bytes_used;
__le64 root_dir_objectid;
@@ -656,6 +667,18 @@ enum btrfs_exclusive_operation {
BTRFS_EXCLOP_SWAP_ACTIVATE,
};
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -851,11 +874,11 @@ struct btrfs_fs_info {
struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
- struct btrfs_workqueue *endio_workers;
- struct btrfs_workqueue *endio_meta_workers;
- struct btrfs_workqueue *endio_raid56_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *endio_raid56_workers;
struct workqueue_struct *rmw_workers;
- struct btrfs_workqueue *endio_meta_write_workers;
+ struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
@@ -1065,6 +1088,9 @@ struct btrfs_fs_info {
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -2477,8 +2503,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
- log_root_transid, 64);
BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
log_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
@@ -2735,8 +2759,16 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
+static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums,
+ u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+ return csums + offset_in_sectors * fs_info->csum_size;
+}
+
/*
- * Take the number of bytes to be checksummmed and figure out how many leaves
+ * Take the number of bytes to be checksummed and figure out how many leaves
* it would require to store the csums for that many bytes.
*/
static inline u64 btrfs_csum_bytes_to_leaves(
@@ -3253,8 +3285,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
@@ -3307,9 +3342,9 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits);
+ u32 bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
@@ -3355,6 +3390,12 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 66779ab3ed4a..ad4fa8c173fa 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->p_list);
}
-static inline int btrfs_is_continuous_delayed_item(
- struct btrfs_delayed_item *item1,
- struct btrfs_delayed_item *item2)
-{
- if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
- item1->key.objectid == item2->key.objectid &&
- item1->key.type == item2->key.type &&
- item1->key.offset + 1 == item2->key.offset)
- return 1;
- return 0;
-}
-
static struct btrfs_delayed_node *btrfs_get_delayed_node(
struct btrfs_inode *btrfs_inode)
{
@@ -391,8 +379,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
}
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
- struct btrfs_delayed_item *ins,
- int action)
+ struct btrfs_delayed_item *ins)
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
@@ -401,9 +388,9 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
int cmp;
bool leftmost = true;
- if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (action == BTRFS_DELAYED_DELETION_ITEM)
+ else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
root = &delayed_node->del_root;
else
BUG();
@@ -429,32 +416,19 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
ins->delayed_node = delayed_node;
- ins->ins_or_del = action;
- if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
- action == BTRFS_DELAYED_INSERTION_ITEM &&
+ /* Delayed items are always for dir index items. */
+ ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY);
+
+ if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM &&
ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ delayed_node->index_cnt = ins->key.offset + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
return 0;
}
-static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_INSERTION_ITEM);
-}
-
-static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_DELETION_ITEM);
-}
-
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
@@ -566,7 +540,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
num_bytes, 1);
- item->bytes_reserved = num_bytes;
+ /*
+ * For insertions we track reserved metadata space by accounting
+ * for the number of leaves that will be used, based on the delayed
+ * node's index_items_size field.
+ */
+ if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
+ item->bytes_reserved = num_bytes;
}
return ret;
@@ -592,6 +572,21 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
+static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
+ unsigned int num_leaves)
+{
+ struct btrfs_fs_info *fs_info = node->root->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
+
+ /* There are no space reservations during log replay, bail out. */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
+ bytes, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
+}
+
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -665,23 +660,41 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * Insert a single delayed item or a batch of delayed items that have consecutive
- * keys if they exist.
+ * Insert a single delayed item or a batch of delayed items, as many as possible
+ * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
+ * in the rbtree, and if there's a gap between two consecutive dir index items,
+ * then it means at some point we had delayed dir indexes to add but they got
+ * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
+ * into the subvolume tree. Dir index keys also have their offsets coming from a
+ * monotonically increasing counter, so we can't get new keys with an offset that
+ * fits within a gap between delayed dir index items.
*/
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_node *node = first_item->delayed_node;
LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
- const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_item_batch batch;
int total_size;
char *ins_data = NULL;
int ret;
+ lockdep_assert_held(&node->mutex);
+
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(first_item->bytes_reserved == 0);
+
list_add_tail(&first_item->tree_list, &item_list);
batch.total_data_size = first_item->data_len;
batch.nr = 1;
@@ -692,9 +705,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
int next_size;
next = __btrfs_next_delayed_item(curr);
- if (!next || !btrfs_is_continuous_delayed_item(curr, next))
+ if (!next)
break;
+ ASSERT(next->bytes_reserved == 0);
+
next_size = next->data_len + sizeof(struct btrfs_item);
if (total_size + next_size > max_size)
break;
@@ -751,9 +766,31 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
+ ASSERT(node->index_item_leaves > 0);
+
+ if (next) {
+ /*
+ * We inserted one batch of items into a leaf a there are more
+ * items to flush in a future batch, now release one unit of
+ * metadata space from the delayed block reserve, corresponding
+ * the leaf we just flushed to.
+ */
+ btrfs_delayed_item_release_leaves(node, 1);
+ node->index_item_leaves--;
+ } else {
+ /*
+ * There are no more items to insert. We can have a number of
+ * reserved leaves > 1 here - this happens when many dir index
+ * items are added and then removed before they are flushed (file
+ * names with a very short life, never span a transaction). So
+ * release all remaining leaves.
+ */
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
- btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
out:
@@ -789,62 +826,75 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- struct list_head head;
- int nitems, i, last_item;
- int ret = 0;
+ struct extent_buffer *leaf = path->nodes[0];
+ LIST_HEAD(batch_list);
+ int nitems, slot, last_slot;
+ int ret;
+ u64 total_reserved_size = item->bytes_reserved;
- BUG_ON(!path->nodes[0]);
+ ASSERT(leaf != NULL);
- leaf = path->nodes[0];
+ slot = path->slots[0];
+ last_slot = btrfs_header_nritems(leaf) - 1;
+ /*
+ * Our caller always gives us a path pointing to an existing item, so
+ * this can not happen.
+ */
+ ASSERT(slot <= last_slot);
+ if (WARN_ON(slot > last_slot))
+ return -ENOENT;
- i = path->slots[0];
- last_item = btrfs_header_nritems(leaf) - 1;
- if (i > last_item)
- return -ENOENT; /* FIXME: Is errno suitable? */
+ nitems = 1;
+ curr = item;
+ list_add_tail(&curr->tree_list, &batch_list);
- next = item;
- INIT_LIST_HEAD(&head);
- btrfs_item_key_to_cpu(leaf, &key, i);
- nitems = 0;
/*
- * count the number of the dir index items that we can delete in batch
+ * Keep checking if the next delayed item matches the next item in the
+ * leaf - if so, we can add it to the batch of items to delete from the
+ * leaf.
*/
- while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
- list_add_tail(&next->tree_list, &head);
- nitems++;
+ while (slot < last_slot) {
+ struct btrfs_key key;
- curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
- break;
-
- i++;
- if (i > last_item)
+ slot++;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_comp_cpu_keys(&next->key, &key) != 0)
break;
- btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems++;
+ curr = next;
+ list_add_tail(&curr->tree_list, &batch_list);
+ total_reserved_size += curr->bytes_reserved;
}
- if (!nitems)
- return 0;
-
ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
if (ret)
- goto out;
+ return ret;
+
+ /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
+ if (total_reserved_size > 0) {
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we
+ * don't need to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item",
+ item->key.objectid, total_reserved_size,
+ 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
+ total_reserved_size, NULL);
+ }
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(root, curr);
+ list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- return ret;
+ return 0;
}
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
@@ -852,43 +902,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_deletion_item(node);
- if (!curr)
- goto delete_fail;
+ while (ret == 0) {
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_deletion_item(node);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+
+ ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1);
+ if (ret > 0) {
+ /*
+ * There's no matching item in the leaf. This means we
+ * have already deleted this item in a past run of the
+ * delayed items. We ignore errors when running delayed
+ * items from an async context, through a work queue job
+ * running btrfs_async_run_delayed_root(), and don't
+ * release delayed items that failed to complete. This
+ * is because we will retry later, and at transaction
+ * commit time we always run delayed items and will
+ * then deal with errors if they fail to run again.
+ *
+ * So just release delayed items for which we can't find
+ * an item in the tree, and move to the next item.
+ */
+ btrfs_release_path(path);
+ btrfs_release_delayed_item(item);
+ ret = 0;
+ } else if (ret == 0) {
+ ret = btrfs_batch_delete_items(trans, root, path, item);
+ btrfs_release_path(path);
+ }
- ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- if (ret < 0)
- goto delete_fail;
- else if (ret > 0) {
/*
- * can't find the item which the node points to, so this node
- * is invalid, just drop it.
+ * We unlock and relock on each iteration, this is to prevent
+ * blocking other tasks for too long while we are being run from
+ * the async context (work queue job). Those tasks are typically
+ * running system calls like creat/mkdir/rename/unlink/etc which
+ * need to add delayed items to this delayed node.
*/
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- btrfs_release_delayed_item(prev);
- ret = 0;
- btrfs_release_path(path);
- if (curr) {
- mutex_unlock(&node->mutex);
- goto do_again;
- } else
- goto delete_fail;
+ mutex_unlock(&node->mutex);
}
- btrfs_batch_delete_items(trans, root, path, curr);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-delete_fail:
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return ret;
}
@@ -1347,9 +1406,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_disk_key *disk_key, u8 type,
u64 index)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
+ bool reserve_leaf_space;
+ u32 data_len;
int ret;
delayed_node = btrfs_get_or_create_delayed_node(dir);
@@ -1365,6 +1428,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
delayed_item->key.objectid = btrfs_ino(dir);
delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
delayed_item->key.offset = index;
+ delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
@@ -1374,15 +1438,52 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ data_len = delayed_item->data_len + sizeof(struct btrfs_item);
mutex_lock(&delayed_node->mutex);
- ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+
+ if (delayed_node->index_item_leaves == 0 ||
+ delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
+ delayed_node->curr_index_batch_size = data_len;
+ reserve_leaf_space = true;
+ } else {
+ delayed_node->curr_index_batch_size += data_len;
+ reserve_leaf_space = false;
+ }
+
+ if (reserve_leaf_space) {
+ ret = btrfs_delayed_item_reserve_metadata(trans, dir->root,
+ delayed_item);
+ /*
+ * Space was reserved for a dir index item insertion when we
+ * started the transaction, so getting a failure here should be
+ * impossible.
+ */
+ if (WARN_ON(ret)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_item(delayed_item);
+ goto release_node;
+ }
+
+ delayed_node->index_item_leaves++;
+ } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+ }
+
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1410,8 +1511,37 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
return 1;
}
- btrfs_delayed_item_release_metadata(node->root, item);
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(item->bytes_reserved == 0);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * If there's only one leaf reserved, we can decrement this item from the
+ * current batch, otherwise we can not because we don't know which leaf
+ * it belongs to. With the current limit on delayed items, we rarely
+ * accumulate enough dir index items to fill more than one leaf (even
+ * when using a leaf size of 4K).
+ */
+ if (node->index_item_leaves == 1) {
+ const u32 data_len = item->data_len + sizeof(struct btrfs_item);
+
+ ASSERT(node->curr_index_batch_size >= data_len);
+ node->curr_index_batch_size -= data_len;
+ }
+
btrfs_release_delayed_item(item);
+
+ /* If we now have no more dir index items, we can release all leaves. */
+ if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
mutex_unlock(&node->mutex);
return 0;
}
@@ -1444,6 +1574,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
item->key = item_key;
+ item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM;
ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
/*
@@ -1458,7 +1589,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_lock(&node->mutex);
- ret = __btrfs_add_delayed_deletion_item(node, item);
+ ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1826,12 +1957,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
}
+ if (delayed_node->index_item_leaves > 0) {
+ btrfs_delayed_item_release_leaves(delayed_node,
+ delayed_node->index_item_leaves);
+ delayed_node->index_item_leaves = 0;
+ }
+
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
btrfs_delayed_item_release_metadata(root, curr_item);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index b2412160c5bc..9795dc295a18 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -58,6 +58,17 @@ struct btrfs_delayed_node {
u64 index_cnt;
unsigned long flags;
int count;
+ /*
+ * The size of the next batch of dir index items to insert (if this
+ * node is from a directory inode). Protected by @mutex.
+ */
+ u32 curr_index_batch_size;
+ /*
+ * Number of leaves reserved for inserting dir index items (if this
+ * node belongs to a directory inode). This may be larger then the
+ * actual number of leaves we end up using. Protected by @mutex.
+ */
+ u32 index_item_leaves;
};
struct btrfs_delayed_item {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ba005c41983..524fc8014d14 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -63,40 +62,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete. This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
- struct bio *bio;
- bio_end_io_t *end_io;
- void *private;
- struct btrfs_fs_info *info;
- blk_status_t status;
- enum btrfs_wq_endio_type metadata;
- struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
- btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
- sizeof(struct btrfs_end_io_wq),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_end_io_wq_cache)
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
- kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
if (fs_info->csum_shash)
@@ -255,8 +220,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
goto out;
}
btrfs_err_rl(eb->fs_info,
- "parent transid verify failed on %llu wanted %llu found %llu",
- eb->start,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror,
parent_transid, btrfs_header_generation(eb));
ret = 1;
clear_extent_buffer_uptodate(eb);
@@ -586,21 +551,23 @@ static int validate_extent_buffer(struct extent_buffer *eb)
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
- eb->start, found_start);
+ btrfs_err_rl(fs_info,
+ "bad tree block start, mirror %u want %llu have %llu",
+ eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
if (check_tree_block_fsid(eb)) {
- btrfs_err_rl(fs_info, "bad fsid on block %llu",
- eb->start);
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "bad tree block level %d on %llu",
- (int)btrfs_header_level(eb), eb->start);
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
ret = -EIO;
goto out;
}
@@ -611,8 +578,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- eb->start,
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
@@ -637,8 +604,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
set_extent_buffer_uptodate(eb);
else
btrfs_err(fs_info,
- "block=%llu read time tree block corruption detected",
- eb->start);
+ "read time tree block corruption detected on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
out:
return ret;
}
@@ -739,58 +706,6 @@ err:
return ret;
}
-static void end_workqueue_bio(struct bio *bio)
-{
- struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
- struct btrfs_fs_info *fs_info;
- struct btrfs_workqueue *wq;
-
- fs_info = end_io_wq->info;
- end_io_wq->status = bio->bi_status;
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- wq = fs_info->endio_meta_write_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- wq = fs_info->endio_freespace_worker;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else
- wq = fs_info->endio_write_workers;
- } else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
- }
-
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
- btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata)
-{
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
- if (!end_io_wq)
- return BLK_STS_RESOURCE;
-
- end_io_wq->private = bio->bi_private;
- end_io_wq->end_io = bio->bi_end_io;
- end_io_wq->info = info;
- end_io_wq->status = 0;
- end_io_wq->bio = bio;
- end_io_wq->metadata = metadata;
-
- bio->bi_private = end_io_wq;
- bio->bi_end_io = end_workqueue_bio;
- return 0;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -815,7 +730,6 @@ static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async;
struct inode *inode;
- blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
inode = async->inode;
@@ -833,11 +747,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
- if (ret) {
- async->bio->bi_status = ret;
- bio_endio(async->bio);
- }
+ btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -848,16 +758,23 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+/*
+ * Submit bio to an async queue.
+ *
+ * Retrun:
+ * - true if the work has been succesfuly submitted
+ * - false in case of error
+ */
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return BLK_STS_RESOURCE;
+ return false;
async->inode = inode;
async->bio = bio;
@@ -875,7 +792,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
btrfs_queue_work(fs_info->hipri_workers, &async->work);
else
btrfs_queue_work(fs_info->workers, &async->work);
- return 0;
+ return true;
}
static blk_status_t btree_csum_one_bio(struct bio *bio)
@@ -901,7 +818,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
{
/*
* when we're called for a write, we're already in the async
- * submission context. Just jump into btrfs_map_bio
+ * submission context. Just jump into btrfs_submit_bio.
*/
return btree_csum_one_bio(bio);
}
@@ -923,57 +840,54 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
+ bio->bi_opf |= REQ_META;
+
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- /*
- * called for a read, do the setup so that checksum validation
- * can happen in the async kernel threads
- */
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_METADATA);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
- ret = btree_csum_one_bio(bio);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else {
- /*
- * kthread helpers are used to submit writes so that
- * checksumming can happen in parallel across all CPUs
- */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- btree_submit_bio_start);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
}
+ /*
+ * Kthread helpers are used to submit writes so that checksumming can
+ * happen in parallel across all CPUs.
+ */
+ if (should_async_write(fs_info, BTRFS_I(inode)) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ return;
+
+ ret = btree_csum_one_bio(bio);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
+ return;
}
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
#ifdef CONFIG_MIGRATION
-static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+static int btree_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
* we haven't done the locking hook
*/
- if (PageDirty(page))
+ if (folio_test_dirty(src))
return -EAGAIN;
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_get_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
+#else
+#define btree_migrate_folio NULL
#endif
-
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1073,10 +987,8 @@ static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.release_folio = btree_release_folio,
.invalidate_folio = btree_invalidate_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = btree_migratepage,
-#endif
- .dirty_folio = btree_dirty_folio,
+ .migrate_folio = btree_migrate_folio,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1864,7 +1776,7 @@ again:
fail:
/*
* If our caller provided us an anonymous device, then it's his
- * responsability to free it in case we fail. So we have to set our
+ * responsibility to free it in case we fail. So we have to set our
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
@@ -1947,25 +1859,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
return root;
}
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions. This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
- struct bio *bio;
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
- bio = end_io_wq->bio;
-
- bio->bi_status = end_io_wq->status;
- bio->bi_private = end_io_wq->private;
- bio->bi_end_io = end_io_wq->end_io;
- bio_endio(bio);
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
static int cleaner_kthread(void *arg)
{
struct btrfs_fs_info *fs_info = arg;
@@ -2272,10 +2165,14 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
- btrfs_destroy_workqueue(fs_info->endio_workers);
- btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2289,8 +2186,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
- btrfs_destroy_workqueue(fs_info->endio_meta_workers);
- btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2462,25 +2359,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
fs_info->endio_workers =
- btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+ alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
- max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
- max_active, 2);
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
- max_active, 4);
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2495,7 +2385,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
if (!(fs_info->workers && fs_info->hipri_workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
+ fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
@@ -3655,6 +3545,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
err = -EINVAL;
goto fail_alloc;
}
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata write, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
@@ -4178,7 +4082,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
if (ret == -ENOENT) {
@@ -4193,27 +4097,24 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
device->commit_total_bytes)
break;
- page = find_get_page(device->bdev->bd_inode->i_mapping,
+ folio = filemap_get_folio(device->bdev->bd_inode->i_mapping,
bytenr >> PAGE_SHIFT);
- if (!page) {
+ if (!folio) {
errors++;
if (i == 0)
primary_failed = true;
continue;
}
- /* Page is submitted locked and unlocked once the IO completes */
- wait_on_page_locked(page);
- if (PageError(page)) {
+ /* Folio is unlocked once the IO completes */
+ folio_wait_locked(folio);
+ if (!folio_test_uptodate(folio)) {
errors++;
if (i == 0)
primary_failed = true;
}
- /* Drop our reference */
- put_page(page);
-
- /* Drop the reference from the writing run */
- put_page(page);
+ /* Drop our reference and the one from the writing run */
+ folio_put_refs(folio, 2);
}
/* log error, force error return */
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4ee8c42c9f78..8993b428e09c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,13 +17,6 @@
*/
#define BTRFS_BDEV_BLOCKSIZE (4096)
-enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA,
- BTRFS_WQ_ENDIO_METADATA,
- BTRFS_WQ_ENDIO_FREE_SPACE,
- BTRFS_WQ_ENDIO_RAID56,
-};
-
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = SZ_16K;
@@ -121,11 +114,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
int level, struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start);
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
@@ -145,8 +136,6 @@ int btree_lock_page_hook(struct page *page, void *data,
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4157ecc27d4b..f97a0f28f464 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bioc has devices
- * associated to its stripes that don't go away while we are discarding.
+ * Avoid races with device replace and make sure the devices in the
+ * stripes don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_io_stripe *stripe;
+ struct btrfs_discard_stripe *stripes;
+ unsigned int num_stripes;
int i;
num_bytes = end - cur;
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bioc, 0);
- /*
- * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
- * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
- * thus we can't continue anyway.
- */
- if (ret < 0)
- goto out;
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ if (IS_ERR(stripes)) {
+ ret = PTR_ERR(stripes);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ break;
+ }
- stripe = bioc->stripes;
- for (i = 0; i < bioc->num_stripes; i++, stripe++) {
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_discard_stripe *stripe = stripes + i;
u64 bytes;
- struct btrfs_device *device = stripe->dev;
- if (!device->bdev) {
+ if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &stripe->dev->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes);
- if (!ret) {
- discarded_bytes += bytes;
- } else if (ret != -EOPNOTSUPP) {
+ if (ret) {
/*
- * Logic errors or -ENOMEM, or -EIO, but
- * unlikely to happen.
- *
- * And since there are two loops, explicitly
- * go to out to avoid confusion.
+ * Keep going if discard is not supported by the
+ * device.
*/
- btrfs_put_bioc(bioc);
- goto out;
+ if (ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ } else {
+ discarded_bytes += bytes;
}
-
- /*
- * Just in case we get back EOPNOTSUPP for some reason,
- * just ignore the return value so we don't screw up
- * people calling discard_extent.
- */
- ret = 0;
}
- btrfs_put_bioc(bioc);
+ kfree(stripes);
+ if (ret)
+ break;
cur += num_bytes;
}
-out:
btrfs_bio_counter_dec(fs_info);
-
if (actual_bytes)
*actual_bytes = discarded_bytes;
-
-
- if (ret == -EOPNOTSUPP)
- ret = 0;
return ret;
}
@@ -5992,7 +5976,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
*/
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
- u64 start = SZ_1M, len = 0, end = 0;
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
*trimmed = 0;
@@ -6036,8 +6020,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
break;
}
- /* Ensure we skip the reserved area in the first 1M */
- start = max_t(u64, start, SZ_1M);
+ /* Ensure we skip the reserved space on each device. */
+ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
/*
* If find_first_clear_extent_bit find a range that spans the
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 04e36343da3a..0681c0da7926 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -144,6 +144,7 @@ struct tree_entry {
*/
struct btrfs_bio_ctrl {
struct bio *bio;
+ int mirror_num;
enum btrfs_compression_type compress_type;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
@@ -178,61 +179,52 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
return ret;
}
-static void submit_one_bio(struct bio *bio, int mirror_num,
- enum btrfs_compression_type compress_type)
+static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- struct extent_io_tree *tree = bio->bi_private;
+ struct bio *bio;
+ struct inode *inode;
+ int mirror_num;
+
+ if (!bio_ctrl->bio)
+ return;
- bio->bi_private = NULL;
+ bio = bio_ctrl->bio;
+ inode = bio_first_page_all(bio)->mapping->host;
+ mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
- if (is_data_inode(tree->private_data))
- btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
- compress_type);
+ if (!is_data_inode(inode))
+ btrfs_submit_metadata_bio(inode, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(inode, bio, mirror_num);
else
- btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
- /*
- * Above submission hooks will handle the error by ending the bio,
- * which will do the cleanup properly. So here we should not return
- * any error, or the caller of submit_extent_page() will do cleanup
- * again, causing problems.
- */
-}
+ btrfs_submit_data_read_bio(inode, bio, mirror_num,
+ bio_ctrl->compress_type);
-/* Cleanup unsubmitted bios */
-static void end_write_bio(struct extent_page_data *epd, int ret)
-{
- struct bio *bio = epd->bio_ctrl.bio;
-
- if (bio) {
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- epd->bio_ctrl.bio = NULL;
- }
+ /* The bio is owned by the bi_end_io handler now */
+ bio_ctrl->bio = NULL;
}
/*
- * Submit bio from extent page data via submit_one_bio
- *
- * Return 0 if everything is OK.
- * Return <0 for error.
+ * Submit or fail the current bio in an extent_page_data structure.
*/
-static void flush_write_bio(struct extent_page_data *epd)
+static void submit_write_bio(struct extent_page_data *epd, int ret)
{
struct bio *bio = epd->bio_ctrl.bio;
- if (bio) {
- submit_one_bio(bio, 0, 0);
- /*
- * Clean up of epd->bio is handled by its endio function.
- * And endio is either triggered by successful bio execution
- * or the error handler of submit bio hook.
- * So at this point, no matter what happened, we don't need
- * to clean up epd->bio.
- */
+ if (!bio)
+ return;
+
+ if (ret) {
+ ASSERT(ret < 0);
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ /* The bio is owned by the bi_end_io handler now */
epd->bio_ctrl.bio = NULL;
+ } else {
+ submit_one_bio(&epd->bio_ctrl);
}
}
@@ -376,131 +368,121 @@ void free_extent_state(struct extent_state *state)
}
}
-static struct rb_node *tree_insert(struct rb_root *root,
- struct rb_node *search_start,
- u64 offset,
- struct rb_node *node,
- struct rb_node ***p_in,
- struct rb_node **parent_in)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- if (p_in && parent_in) {
- p = *p_in;
- parent = *parent_in;
- goto do_insert;
- }
-
- p = search_start ? &search_start : &root->rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- p = &(*p)->rb_left;
- else if (offset > entry->end)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
-do_insert:
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
/**
* Search @tree for an entry that contains @offset. Such entry would have
* entry->start <= offset && entry->end >= offset.
*
* @tree: the tree to search
* @offset: offset that should fall within an entry in @tree
- * @next_ret: pointer to the first entry whose range ends after @offset
- * @prev_ret: pointer to the first entry whose range begins before @offset
- * @p_ret: pointer where new node should be anchored (used when inserting an
+ * @node_ret: pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
- * This function returns a pointer to the entry that contains @offset byte
- * address. If no such entry exists, then NULL is returned and the other
- * pointer arguments to the function are filled, otherwise the found entry is
- * returned and other pointers are left untouched.
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return pointer to entry that ends before @offset
+ * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
*/
-static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **next_ret,
- struct rb_node **prev_ret,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
+static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
{
struct rb_root *root = &tree->state;
- struct rb_node **n = &root->rb_node;
+ struct rb_node **node = &root->rb_node;
struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
struct tree_entry *entry;
- struct tree_entry *prev_entry = NULL;
- while (*n) {
- prev = *n;
+ while (*node) {
+ prev = *node;
entry = rb_entry(prev, struct tree_entry, rb_node);
- prev_entry = entry;
if (offset < entry->start)
- n = &(*n)->rb_left;
+ node = &(*node)->rb_left;
else if (offset > entry->end)
- n = &(*n)->rb_right;
+ node = &(*node)->rb_right;
else
- return *n;
+ return *node;
}
- if (p_ret)
- *p_ret = n;
+ if (node_ret)
+ *node_ret = node;
if (parent_ret)
*parent_ret = prev;
- if (next_ret) {
- orig_prev = prev;
- while (prev && offset > prev_entry->end) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
+ /* Search neighbors until we find the first one past the end */
+ while (prev && offset > entry->end) {
+ prev = rb_next(prev);
+ entry = rb_entry(prev, struct tree_entry, rb_node);
}
- if (prev_ret) {
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
- }
- return NULL;
+ return prev;
}
-static inline struct rb_node *
-tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
{
- struct rb_node *next= NULL;
- struct rb_node *ret;
-
- ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
- if (!ret)
- return next;
- return ret;
+ return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
+/**
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
{
- return tree_search_for_insert(tree, offset, NULL, NULL);
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct tree_entry *entry;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return *node;
+ }
+
+ orig_prev = prev;
+ while (prev && offset > entry->end) {
+ prev = rb_next(prev);
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *next_ret = prev;
+ prev = orig_prev;
+
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ while (prev && offset < entry->start) {
+ prev = rb_prev(prev);
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *prev_ret = prev;
+
+ return NULL;
}
/*
@@ -554,7 +536,7 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, u32 *bits,
+ struct extent_state *state, u32 bits,
struct extent_changeset *changeset);
/*
@@ -568,37 +550,56 @@ static void set_state_bits(struct extent_io_tree *tree,
* probably isn't what you want to call (see set/clear_extent_bit).
*/
static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state, u64 start, u64 end,
- struct rb_node ***p,
- struct rb_node **parent,
- u32 *bits, struct extent_changeset *changeset)
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
{
- struct rb_node *node;
-
- if (end < start) {
- btrfs_err(tree->fs_info,
- "insert state: end < start %llu %llu", end, start);
- WARN_ON(1);
- }
- state->start = start;
- state->end = end;
+ struct rb_node **node;
+ struct rb_node *parent;
+ const u64 end = state->end;
set_state_bits(tree, state, bits, changeset);
- node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
- if (node) {
- struct extent_state *found;
- found = rb_entry(node, struct extent_state, rb_node);
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- found->start, found->end, start, end);
- return -EEXIST;
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct tree_entry *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
}
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
merge_state(tree, state);
return 0;
}
/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
* offset inside 'orig' where it should be split.
@@ -615,7 +616,8 @@ static int insert_state(struct extent_io_tree *tree,
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
- struct rb_node *node;
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
if (tree->private_data && is_data_inode(tree->private_data))
btrfs_split_delalloc_extent(tree->private_data, orig, split);
@@ -625,12 +627,27 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
- &prealloc->rb_node, NULL, NULL);
- if (node) {
- free_extent_state(prealloc);
- return -EEXIST;
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct tree_entry *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
}
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
return 0;
}
@@ -652,11 +669,11 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- u32 *bits, int wake,
+ u32 bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
- u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -818,8 +835,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake,
- changeset);
+ state = clear_state_bit(tree, state, bits, wake, changeset);
goto next;
}
goto search_again;
@@ -840,13 +856,13 @@ hit_next:
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, &bits, wake, changeset);
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, &bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
@@ -937,9 +953,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- u32 *bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
- u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data))
@@ -1033,11 +1049,9 @@ again:
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
@@ -1060,7 +1074,7 @@ hit_next:
goto out;
}
- set_state_bits(tree, state, &bits, changeset);
+ set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -1116,7 +1130,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits, changeset);
+ set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -1150,8 +1164,9 @@ hit_next:
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, changeset);
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -1179,7 +1194,7 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits, changeset);
+ set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
@@ -1274,10 +1289,9 @@ again:
err = -ENOMEM;
goto out;
}
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
@@ -1294,9 +1308,9 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
+ set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1335,10 +1349,9 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
+ set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0,
- NULL);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1372,8 +1385,9 @@ hit_next:
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, NULL);
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1398,9 +1412,9 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits, NULL);
+ set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
@@ -1674,7 +1688,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
/* Find first extent with bits cleared */
while (1) {
- node = __etree_search(tree, start, &next, &prev, NULL, NULL);
+ node = tree_search_prev_next(tree, start, &prev, &next);
if (!node && !next && !prev) {
/*
* Tree is completely empty, send full range and let
@@ -2727,18 +2741,31 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-static blk_status_t submit_data_read_repair(struct inode *inode,
- struct bio *failed_bio,
- u32 bio_offset, struct page *page,
- unsigned int pgoff,
- u64 start, u64 end,
- int failed_mirror,
- unsigned int error_bitmap)
+static void end_sector_io(struct page *page, u64 offset, bool uptodate)
{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct extent_state *cached = NULL;
+
+ end_page_read(page, uptodate, offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached);
+}
+
+static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio,
+ u32 bio_offset, const struct bio_vec *bvec,
+ int failed_mirror, unsigned int error_bitmap)
+{
+ const unsigned int pgoff = bvec->bv_offset;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct page *page = bvec->bv_page;
+ const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
+ const u64 end = start + bvec->bv_len - 1;
const u32 sectorsize = fs_info->sectorsize;
const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
- int error = 0;
int i;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2758,7 +2785,6 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
/* Iterate through all the sectors in the range */
for (i = 0; i < nr_bits; i++) {
const unsigned int offset = i * sectorsize;
- struct extent_state *cached = NULL;
bool uptodate = false;
int ret;
@@ -2774,7 +2800,7 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
ret = btrfs_repair_one_sector(inode, failed_bio,
bio_offset + offset,
page, pgoff + offset, start + offset,
- failed_mirror, btrfs_submit_data_bio);
+ failed_mirror, btrfs_submit_data_read_bio);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -2785,24 +2811,12 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
continue;
}
/*
- * Repair failed, just record the error but still continue.
- * Or the remaining sectors will not be properly unlocked.
+ * Continue on failed repair, otherwise the remaining sectors
+ * will not be properly unlocked.
*/
- if (!error)
- error = ret;
next:
- end_page_read(page, uptodate, start + offset, sectorsize);
- if (uptodate)
- set_extent_uptodate(&BTRFS_I(inode)->io_tree,
- start + offset,
- start + offset + sectorsize - 1,
- &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
- start + offset,
- start + offset + sectorsize - 1,
- &cached);
+ end_sector_io(page, start + offset, uptodate);
}
- return errno_to_blk_status(error);
}
/* lots and lots of room for performance fixes in the end_bio funcs */
@@ -3015,7 +3029,6 @@ static void end_bio_extent_readpage(struct bio *bio)
*/
u32 bio_offset = 0;
int mirror;
- int ret;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -3026,6 +3039,7 @@ static void end_bio_extent_readpage(struct bio *bio)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
unsigned int error_bitmap = (unsigned int)-1;
+ bool repair = false;
u64 start;
u64 end;
u32 len;
@@ -3063,57 +3077,23 @@ static void end_bio_extent_readpage(struct bio *bio)
if (is_data_inode(inode)) {
error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
- ret = error_bitmap;
+ if (error_bitmap)
+ uptodate = false;
} else {
- ret = btrfs_validate_metadata_buffer(bbio,
- page, start, end, mirror);
+ if (btrfs_validate_metadata_buffer(bbio,
+ page, start, end, mirror))
+ uptodate = false;
}
- if (ret)
- uptodate = false;
- else
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, tree, start,
- page,
- btrfs_ino(BTRFS_I(inode)), 0);
}
- if (likely(uptodate))
- goto readpage_ok;
-
- if (is_data_inode(inode)) {
- /*
- * If we failed to submit the IO at all we'll have a
- * mirror_num == 0, in which case we need to just mark
- * the page with an error and unlock it and carry on.
- */
- if (mirror == 0)
- goto readpage_ok;
-
- /*
- * submit_data_read_repair() will handle all the good
- * and bad sectors, we just continue to the next bvec.
- */
- submit_data_read_repair(inode, bio, bio_offset, page,
- start - page_offset(page),
- start, end, mirror,
- error_bitmap);
-
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
- continue;
- } else {
- struct extent_buffer *eb;
-
- eb = find_extent_buffer_readpage(fs_info, page, start);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = mirror;
- atomic_dec(&eb->io_pages);
- }
-readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, tree, start, page,
+ btrfs_ino(BTRFS_I(inode)), 0);
+
/*
* Zero out the remaining part if this range straddles
* i_size.
@@ -3130,14 +3110,40 @@ readpage_ok:
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
}
+ } else if (is_data_inode(inode)) {
+ /*
+ * Only try to repair bios that actually made it to a
+ * device. If the bio failed to be submitted mirror
+ * is 0 and we need to fail it without retrying.
+ */
+ if (mirror > 0)
+ repair = true;
+ } else {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer_readpage(fs_info, page, start);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = mirror;
+ atomic_dec(&eb->io_pages);
}
+
+ if (repair) {
+ /*
+ * submit_data_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
+ */
+ submit_data_read_repair(inode, bio, bio_offset, bvec,
+ mirror, error_bitmap);
+ } else {
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, PageUptodate(page));
+ }
+
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
- /* Update page status and unlock */
- end_page_read(page, uptodate, start, len);
- endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3206,19 +3212,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
-{
- struct btrfs_bio *bbio;
- struct bio *new;
-
- /* Bio allocation backed by a bioset does not fail */
- new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(new);
- btrfs_bio_init(bbio);
- bbio->iter = bio->bi_iter;
- return new;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
@@ -3378,7 +3371,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio_ctrl->bio = bio;
bio_ctrl->compress_type = compress_type;
bio->bi_end_io = end_io_func;
- bio->bi_private = &inode->io_tree;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
@@ -3443,7 +3435,6 @@ static int submit_extent_page(unsigned int opf,
struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
bio_end_io_t end_io_func,
- int mirror_num,
enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
@@ -3455,10 +3446,8 @@ static int submit_extent_page(unsigned int opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
- if (force_bio_submit && bio_ctrl->bio) {
- submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
- bio_ctrl->bio = NULL;
- }
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
while (cur < pg_offset + size) {
u32 offset = cur - pg_offset;
@@ -3498,8 +3487,7 @@ static int submit_extent_page(unsigned int opf,
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
- submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
- bio_ctrl->bio = NULL;
+ submit_one_bio(bio_ctrl);
}
cur += added;
}
@@ -3647,7 +3635,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, iosize);
- flush_dcache_page(page);
}
}
begin_page_read(fs_info, page);
@@ -3662,7 +3649,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = PAGE_SIZE - pg_offset;
memzero_page(page, pg_offset, iosize);
- flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
@@ -3746,7 +3732,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct extent_state *cached = NULL;
memzero_page(page, pg_offset, iosize);
- flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
@@ -3779,10 +3764,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
bio_ctrl, page, disk_bytenr, iosize,
- pg_offset,
- end_bio_extent_readpage, 0,
- this_bio_flag,
- force_bio_submit);
+ pg_offset, end_bio_extent_readpage,
+ this_bio_flag, force_bio_submit);
if (ret) {
/*
* We have to unlock the remaining range, or the page
@@ -3815,8 +3798,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
*/
- if (bio_ctrl.bio)
- submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
+ submit_one_bio(&bio_ctrl);
return ret;
}
@@ -4099,7 +4081,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
disk_bytenr, iosize,
cur - page_offset(page),
end_bio_extent_writepage,
- 0, 0, false);
+ 0, false);
if (ret) {
has_error = true;
if (!saved_ret)
@@ -4164,10 +4146,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- if (page->index == end_index) {
+ if (page->index == end_index)
memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
- flush_dcache_page(page);
- }
ret = set_page_extent_mapped(page);
if (ret < 0) {
@@ -4276,7 +4256,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
flush = 1;
btrfs_tree_lock(eb);
}
@@ -4286,7 +4266,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!epd->sync_io)
return 0;
if (!flush) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
flush = 1;
}
while (1) {
@@ -4333,7 +4313,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
flush = 1;
}
lock_page(p);
@@ -4575,7 +4555,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ unsigned int write_flags = wbc_to_write_flags(wbc);
bool no_dirty_ebs = false;
int ret;
@@ -4594,7 +4574,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
&epd->bio_ctrl, page, eb->start, eb->len,
eb->start - page_offset(page),
- end_bio_subpage_eb_writepage, 0, 0, false);
+ end_bio_subpage_eb_writepage, 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4620,7 +4600,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
{
u64 disk_bytenr = eb->start;
int i, num_pages;
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ unsigned int write_flags = wbc_to_write_flags(wbc);
int ret = 0;
prepare_eb_write(eb);
@@ -4635,7 +4615,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
&epd->bio_ctrl, p, disk_bytenr,
PAGE_SIZE, 0,
end_bio_extent_buffer_writepage,
- 0, 0, false);
+ 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -4749,7 +4729,7 @@ static int submit_eb_subpage(struct page *page,
cleanup:
/* We hit error, end bio for the submitted extent buffers */
- end_write_bio(epd, ret);
+ submit_write_bio(epd, ret);
return ret;
}
@@ -4928,10 +4908,6 @@ retry:
index = 0;
goto retry;
}
- if (ret < 0) {
- end_write_bio(&epd, ret);
- goto out;
- }
/*
* If something went wrong, don't allow any metadata write bio to be
* submitted.
@@ -4958,21 +4934,17 @@ retry:
* Now such dirty tree block will not be cleaned by any dirty
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
- */
- if (!BTRFS_FS_ERROR(fs_info)) {
- flush_write_bio(&epd);
- } else {
- ret = -EROFS;
- end_write_bio(&epd, ret);
- }
-out:
- btrfs_zoned_meta_io_unlock(fs_info);
- /*
+ *
* We can get ret > 0 from submit_extent_page() indicating how many ebs
* were submitted. Reset it to 0 to avoid false alerts for the caller.
*/
if (ret > 0)
ret = 0;
+ if (!ret && BTRFS_FS_ERROR(fs_info))
+ ret = -EROFS;
+ submit_write_bio(&epd, ret);
+
+ btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
@@ -5074,7 +5046,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
lock_page(page);
}
@@ -5085,7 +5057,7 @@ retry:
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
wait_on_page_writeback(page);
}
@@ -5125,7 +5097,7 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
goto retry;
}
@@ -5136,26 +5108,6 @@ retry:
return ret;
}
-int extent_write_full_page(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
- .extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
- };
-
- ret = __extent_writepage(page, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
-
- flush_write_bio(&epd);
- return ret;
-}
-
/*
* Submit the pages in the range to bio for call sites which delalloc range has
* already been ran (aka, ordered extent inserted) and all pages are still
@@ -5213,10 +5165,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
cur = cur_end + 1;
}
- if (!found_error)
- flush_write_bio(&epd);
- else
- end_write_bio(&epd, ret);
+ submit_write_bio(&epd, found_error ? ret : 0);
wbc_detach_inode(&wbc_writepages);
if (found_error)
@@ -5241,14 +5190,10 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
- end_write_bio(&epd, ret);
- return ret;
- }
- flush_write_bio(&epd);
+
+ submit_write_bio(&epd, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
+
return ret;
}
@@ -5270,9 +5215,7 @@ void extent_readahead(struct readahead_control *rac)
if (em_cached)
free_extent_map(em_cached);
-
- if (bio_ctrl.bio)
- submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
+ submit_one_bio(&bio_ctrl);
}
/*
@@ -6202,7 +6145,7 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return -EINVAL;
}
if (fs_info->nodesize >= PAGE_SIZE &&
- !IS_ALIGNED(start, PAGE_SIZE)) {
+ !PAGE_ALIGNED(start)) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
start, fs_info->nodesize);
@@ -6599,7 +6542,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
int ret = 0;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
@@ -6631,11 +6576,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+ ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
page, eb->start, eb->len,
eb->start - page_offset(page),
- end_bio_extent_readpage, mirror_num, 0,
- true);
+ end_bio_extent_readpage, 0, true);
if (ret) {
/*
* In the endio function, if we hit something wrong we will
@@ -6644,10 +6588,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- if (bio_ctrl.bio) {
- submit_one_bio(bio_ctrl.bio, mirror_num, 0);
- bio_ctrl.bio = NULL;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -6667,7 +6608,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -6738,10 +6681,10 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ err = submit_extent_page(REQ_OP_READ, NULL,
&bio_ctrl, page, page_offset(page),
PAGE_SIZE, 0, end_bio_extent_readpage,
- mirror_num, 0, false);
+ 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
@@ -6758,10 +6701,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
- if (bio_ctrl.bio) {
- submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type);
- bio_ctrl.bio = NULL;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 23d4103c8831..a76c6ef74cd3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -142,15 +142,10 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
struct extent_map_tree;
-typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
-
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
-int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
@@ -247,7 +242,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9dfde1af8a64..89c6d7ff1987 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2308,7 +2308,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
/* we've logged all the items and now have a consistent
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 05e0c4a5affd..898a69ca2bb9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,7 +114,6 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -125,10 +124,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate);
-
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
@@ -195,11 +190,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = page_offset(locked_page);
- u64 page_end = page_start + PAGE_SIZE - 1;
-
+ u64 page_start, page_end;
struct page *page;
+ if (locked_page) {
+ page_start = page_offset(locked_page);
+ page_end = page_start + PAGE_SIZE - 1;
+ }
+
while (index <= end_index) {
/*
* For locked page, we will call end_extent_writepage() on it
@@ -212,7 +210,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
- if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ if (locked_page && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
@@ -223,7 +221,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
/*
* Here we just clear all Ordered bits for every page in the
- * range, then __endio_write_update_ordered() will handle
+ * range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
@@ -231,20 +229,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
put_page(page);
}
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
- return;
- /*
- * In case this page belongs to the delalloc range being instantiated
- * then skip it, since the first page of a range is going to be
- * properly cleaned up by the caller of run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
+ if (locked_page) {
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_start + PAGE_SIZE)
+ return;
+ /*
+ * In case this page belongs to the delalloc range being
+ * instantiated then skip it, since the first page of a range is
+ * going to be properly cleaned up by the caller of
+ * run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
+ }
}
- return __endio_write_update_ordered(inode, offset, bytes, false);
+ return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -560,8 +561,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
* will unlock the full page.
*/
if (fs_info->sectorsize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(end + 1, PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(end + 1))
return 0;
}
@@ -678,8 +679,8 @@ again:
* Thus we must also check against @actual_end, not just @end.
*/
if (blocksize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(round_up(actual_end, blocksize)))
goto cleanup_and_bail_uncompressed;
}
@@ -927,8 +928,18 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
goto out;
}
if (ret < 0) {
- if (locked_page)
+ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+ if (locked_page) {
+ const u64 page_start = page_offset(locked_page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+
+ btrfs_page_set_error(inode->root->fs_info, locked_page,
+ page_start, PAGE_SIZE);
+ set_page_writeback(locked_page);
+ end_page_writeback(locked_page);
+ end_extent_writepage(locked_page, ret, page_start, page_end);
unlock_page(locked_page);
+ }
goto out;
}
@@ -1133,6 +1144,28 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* *page_started is set to one if we unlock locked_page and do everything
* required to start IO on it. It may be clean and already done with
* IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follow:
+ *
+ * - page_started == 1 (return value)
+ * - All the pages are unlocked. IO is started.
+ * - Note that this can happen only on success
+ * - unlock == 1
+ * - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ * - On success, all the pages are locked for writing out them
+ * - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
@@ -1142,6 +1175,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
+ u64 orig_start = start;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
@@ -1329,18 +1363,47 @@ out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ /*
+ * Now, we have three regions to clean up:
+ *
+ * |-------(1)----|---(2)---|-------------(3)----------|
+ * `- orig_start `- start `- start + cur_alloc_size `- end
+ *
+ * We process each region below.
+ */
+
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
/*
- * If we reserved an extent for our delalloc range (or a subrange) and
- * failed to create the respective ordered extent, then it means that
- * when we reserved the extent we decremented the extent's size from
- * the data space_info's bytes_may_use counter and incremented the
- * space_info's bytes_reserved counter by the same amount. We must make
- * sure extent_clear_unlock_delalloc() does not try to decrement again
- * the data space_info's bytes_may_use counter, therefore we do not pass
- * it the flag EXTENT_CLEAR_DATA_RESV.
+ * For the range (1). We have already instantiated the ordered extents
+ * for this region. They are cleaned up by
+ * btrfs_cleanup_ordered_extents() in e.g,
+ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+ * function.
+ *
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+ if (!unlock && orig_start < start) {
+ if (!locked_page)
+ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
+ }
+
+ /*
+ * For the range (2). If we reserved an extent for our delalloc range
+ * (or a subrange) and failed to create the respective ordered extent,
+ * then it means that when we reserved the extent we decremented the
+ * extent's size from the data space_info's bytes_may_use counter and
+ * incremented the space_info's bytes_reserved counter by the same
+ * amount. We must make sure extent_clear_unlock_delalloc() does not try
+ * to decrement again the data space_info's bytes_may_use counter,
+ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
@@ -1350,12 +1413,19 @@ out_unlock:
page_ops);
start += cur_alloc_size;
if (start >= end)
- goto out;
+ return ret;
}
+
+ /*
+ * For the range (3). We never touched the region. In addition to the
+ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+ * space_info's bytes_may_use counter, reserved in
+ * btrfs_check_data_free_space().
+ */
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
- goto out;
+ return ret;
}
/*
@@ -2274,18 +2344,18 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* list of inodes that have pending delalloc work to be done.
*/
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits)
+ u32 bits)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
@@ -2303,7 +2373,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- if (*bits & EXTENT_DEFRAG)
+ if (bits & EXTENT_DEFRAG)
BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
@@ -2312,7 +2382,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
state->start;
@@ -2325,14 +2395,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* accounting happens.
*/
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
- struct extent_state *state, unsigned *bits)
+ struct extent_state *state, u32 bits)
{
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
- if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+ if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
spin_unlock(&inode->lock);
@@ -2343,7 +2413,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
@@ -2356,7 +2426,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* don't need to call delalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_CLEAR_META_RESV &&
+ if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len, false);
@@ -2366,7 +2436,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
- (*bits & EXTENT_CLEAR_DATA_RESV))
+ (bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
@@ -2381,11 +2451,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
- if (*bits & EXTENT_ADD_INODE_BYTES)
+ if (bits & EXTENT_ADD_INODE_BYTES)
inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
@@ -2580,95 +2650,75 @@ out:
return errno_to_blk_status(ret);
}
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read: sync submit
- *
- * b) write without checksum: sync submit
- *
- * c) write with checksum:
- * c-1) if bio is issued by fsync: sync submit
- * (sync_writers != 0)
- *
- * c-2) if root is reloc root: sync submit
- * (only in case of buffered IO)
- *
- * c-3) otherwise: async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type)
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- blk_status_t ret = 0;
- int skip_sum;
- int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *page = bio_first_bvec_all(bio)->bv_page;
- loff_t file_offset = page_offset(page);
-
- ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
if (ret)
goto out;
}
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
- if (ret)
- goto out;
-
- if (compress_type != BTRFS_COMPRESS_NONE) {
- /*
- * btrfs_submit_compressed_read will handle completing
- * the bio if there were any errors, so just return
- * here.
- */
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ /*
+ * If we need to checksum, and the I/O is not issued by fsync and
+ * friends, that is ->sync_writers != 0, defer the submission to a
+ * workqueue to parallelize it.
+ *
+ * Csum items for reloc roots have already been cloned at this point,
+ * so they are handled as part of the no-checksum case.
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(bi->root)) {
+ if (!atomic_read(&bi->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ btrfs_submit_bio_start))
return;
- } else {
- /*
- * Lookup bio sums does extra checks around whether we
- * need to csum or not, which is why we ignore skip_sum
- * here.
- */
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
- if (ret)
- goto out;
- }
- goto mapit;
- } else if (async && !skip_sum) {
- /* csum items have already been cloned */
- if (btrfs_is_data_reloc_root(root))
- goto mapit;
- /* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
- 0, btrfs_submit_bio_start);
- goto out;
- } else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
if (ret)
goto out;
}
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
+out:
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
+}
-mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
-out:
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
+ }
+
+ /*
+ * Lookup bio sums does extra checks around whether we need to csum or
+ * not, which is why we ignore skip_sum here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
+ return;
}
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
/*
@@ -3102,7 +3152,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
struct btrfs_root *root = inode->root;
@@ -3311,21 +3361,36 @@ out:
return ret;
}
-static void finish_ordered_fn(struct btrfs_work *work)
-{
- struct btrfs_ordered_extent *ordered_extent;
- ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
- btrfs_finish_ordered_io(ordered_extent);
-}
-
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
- finish_ordered_fn, uptodate);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
+}
+
+/*
+ * Verify the checksum for a single sector without any extra action that depend
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected)
+{
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
+
+ ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+
+ shash->tfm = fs_info->csum_shash;
+
+ kaddr = kmap_local_page(page) + pgoff;
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ kunmap_local(kaddr);
+
+ if (memcmp(csum, csum_expected, fs_info->csum_size))
+ return -EIO;
+ return 0;
}
/*
@@ -3338,35 +3403,27 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
* @start: logical offset in the file
*
* The length of such check is always one sector size.
+ *
+ * When csum mismatch is detected, we will also report the error and fill the
+ * corrupted range with zero. (Thus it needs the extra parameters)
*/
static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
u32 len = fs_info->sectorsize;
- const u32 csum_size = fs_info->csum_size;
- unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
ASSERT(pgoff + len <= PAGE_SIZE);
- offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
- kaddr = kmap_atomic(page);
- shash->tfm = fs_info->csum_shash;
-
- crypto_shash_digest(shash, kaddr + pgoff, len, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(csum, csum_expected, csum_size))
+ if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
goto zeroit;
-
return 0;
+
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
bbio->mirror_num);
@@ -4229,7 +4286,7 @@ skip_backref:
/*
* If we are in a rename context, we don't need to update anything in the
* log. That will be done later during the rename by btrfs_log_new_name().
- * Besides that, doing it here would only cause extra unncessary btree
+ * Besides that, doing it here would only cause extra unnecessary btree
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
@@ -4859,7 +4916,6 @@ again:
else
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
- flush_dcache_page(page);
}
btrfs_page_clear_checked(fs_info, page, block_start,
block_end + 1 - block_start);
@@ -6367,7 +6423,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
+ /*
+ * We don't need the path anymore, plus inheriting properties, adding
+ * ACLs, security xattrs, orphan item or adding the link, will result in
+ * allocating yet another path. So just free our path.
+ */
+ btrfs_free_path(path);
+ path = NULL;
if (args->subvol) {
struct inode *parent;
@@ -6424,8 +6486,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- ret = 0;
- goto out;
+ return 0;
discard:
/*
@@ -7589,8 +7650,12 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
const u64 data_alloc_len = length;
bool unlock_extents = false;
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
if (!write)
- len = min_t(u64, len, fs_info->sectorsize);
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
lockstart = start;
lockend = start + len - 1;
@@ -7813,8 +7878,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos += submitted;
length -= submitted;
if (write)
- __endio_write_update_ordered(BTRFS_I(inode), pos,
- length, false);
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1);
@@ -7836,10 +7901,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
return;
if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
- __endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->file_offset,
- dip->bytes,
- !dip->bio.bi_status);
+ btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ dip->file_offset, dip->bytes,
+ !dip->bio.bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->file_offset,
@@ -7859,12 +7923,8 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
- return;
-
refcount_inc(&dip->refs);
- if (btrfs_map_bio(fs_info, bio, mirror_num))
- refcount_dec(&dip->refs);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
@@ -7873,56 +7933,36 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- struct bio_vec bvec;
- struct bvec_iter iter;
- u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (uptodate &&
+ (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page,
+ bv.bv_offset, start))) {
+ clean_io_failure(fs_info, failure_tree, io_tree, start,
+ bv.bv_page, btrfs_ino(BTRFS_I(inode)),
+ bv.bv_offset);
+ } else {
+ int ret;
- __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
- unsigned int i, nr_sectors, pgoff;
-
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
- pgoff = bvec.bv_offset;
- for (i = 0; i < nr_sectors; i++) {
- u64 start = bbio->file_offset + bio_offset;
-
- ASSERT(pgoff < PAGE_SIZE);
- if (uptodate &&
- (!csum || !check_data_csum(inode, bbio,
- bio_offset, bvec.bv_page,
- pgoff, start))) {
- clean_io_failure(fs_info, failure_tree, io_tree,
- start, bvec.bv_page,
- btrfs_ino(BTRFS_I(inode)),
- pgoff);
- } else {
- int ret;
-
- ret = btrfs_repair_one_sector(inode, &bbio->bio,
- bio_offset, bvec.bv_page, pgoff,
- start, bbio->mirror_num,
- submit_dio_repair_bio);
- if (ret)
- err = errno_to_blk_status(ret);
- }
- ASSERT(bio_offset + sectorsize > bio_offset);
- bio_offset += sectorsize;
- pgoff += sectorsize;
+ ret = btrfs_repair_one_sector(inode, &bbio->bio, offset,
+ bv.bv_page, bv.bv_offset, start,
+ bbio->mirror_num,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
}
- return err;
-}
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate)
-{
- btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
- finish_ordered_fn, uptodate);
+ return err;
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -7957,51 +7997,39 @@ static void btrfs_end_dio_bio(struct bio *bio)
btrfs_dio_private_put(dip);
}
-static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
- struct inode *inode, u64 file_offset, int async_submit)
+static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
- bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
- /* Check btrfs_submit_bio_hook() for rules about async submit. */
- if (async_submit)
- async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- if (!write) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto err;
- }
-
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
- if (write && async_submit) {
- ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
- btrfs_submit_bio_start_direct_io);
- goto err;
- } else if (write) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ /* Check btrfs_submit_data_write_bio() for async submit rules */
+ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+ btrfs_submit_bio_start_direct_io))
+ return;
+
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
- if (ret)
- goto err;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ return;
+ }
} else {
- u64 csum_offset;
-
- csum_offset = file_offset - dip->file_offset;
- csum_offset >>= fs_info->sectorsize_bits;
- csum_offset *= fs_info->csum_size;
- btrfs_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
+ file_offset - dip->file_offset);
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0);
-err:
- return ret;
+ btrfs_submit_bio(fs_info, bio, 0);
}
static void btrfs_submit_direct(const struct iomap_iter *iter,
@@ -8114,14 +8142,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
async_submit = 1;
}
- status = btrfs_submit_dio_bio(bio, inode, file_offset,
- async_submit);
- if (status) {
- bio_put(bio);
- if (submit_len > 0)
- refcount_dec(&dip->refs);
- goto out_err_em;
- }
+ btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
dio_data->submitted += clone_len;
clone_offset += clone_len;
@@ -8169,31 +8190,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- int ret;
-
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
-
- /*
- * If we are under memory pressure we will call this directly from the
- * VM, we need to make sure we have the inode referenced for the ordered
- * extent. If not just return like we didn't do anything.
- */
- if (!igrab(inode)) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
- ret = extent_write_full_page(page, wbc);
- btrfs_add_delayed_iput(inode);
- return ret;
-}
-
static int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -8257,30 +8253,24 @@ static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
}
#ifdef CONFIG_MIGRATION
-static int btrfs_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
+static int btrfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
- int ret;
+ int ret = filemap_migrate_folio(mapping, dst, src, mode);
- ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (page_has_private(page))
- attach_page_private(newpage, detach_page_private(page));
-
- if (PageOrdered(page)) {
- ClearPageOrdered(page);
- SetPageOrdered(newpage);
+ if (folio_test_ordered(src)) {
+ folio_clear_ordered(src);
+ folio_set_ordered(dst);
}
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
+#else
+#define btrfs_migrate_folio NULL
#endif
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
@@ -8497,7 +8487,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* Reserving delalloc space after obtaining the page lock can lead to
* deadlock. For example, if a dirty page is locked by this function
* and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepage() function could
+ * dirty page write out, then the btrfs_writepages() function could
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
@@ -8588,10 +8578,9 @@ again:
else
zero_start = PAGE_SIZE;
- if (zero_start != PAGE_SIZE) {
+ if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- flush_dcache_page(page);
- }
+
btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
@@ -9549,15 +9538,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
+ int ret;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
- new_dentry);
+ ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+ else
+ ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
+
+ btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
- return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
- new_dentry, flags);
+ return ret;
}
struct btrfs_delalloc_work {
@@ -10177,9 +10172,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
-static int btrfs_encoded_io_compression_from_extent(
- struct btrfs_fs_info *fs_info,
- int compress_type)
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type)
{
switch (compress_type) {
case BTRFS_COMPRESS_NONE:
@@ -10302,7 +10296,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
struct bio *bio, int mirror_num)
{
struct btrfs_encoded_read_private *priv = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
@@ -10312,19 +10305,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret) {
- btrfs_bio_free_csum(bbio);
- return ret;
- }
-
atomic_inc(&priv->pending);
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- if (ret) {
- atomic_dec(&priv->pending);
- btrfs_bio_free_csum(bbio);
- }
- return ret;
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return BLK_STS_OK;
}
static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
@@ -10384,11 +10367,9 @@ static void btrfs_encoded_read_endio(struct bio *bio)
bio_put(bio);
}
-static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset,
- u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages)
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size, struct page **pages)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = {
@@ -10782,15 +10763,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = -ENOMEM;
goto out_pages;
}
- kaddr = kmap(pages[i]);
+ kaddr = kmap_local_page(pages[i]);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
- kunmap(pages[i]);
+ kunmap_local(kaddr);
ret = -EFAULT;
goto out_pages;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
- kunmap(pages[i]);
+ kunmap_local(kaddr);
}
for (;;) {
@@ -11419,15 +11400,12 @@ static const struct file_operations btrfs_dir_file_operations = {
*/
static const struct address_space_operations btrfs_aops = {
.read_folio = btrfs_read_folio,
- .writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
.invalidate_folio = btrfs_invalidate_folio,
.release_folio = btrfs_release_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = btrfs_migratepage,
-#endif
+ .migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f79af919bc4..7e1b4b0fbd6c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4243,26 +4243,6 @@ out:
return ret;
}
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
-{
- struct btrfs_data_container *inodes = ctx;
- const size_t c = 3 * sizeof(u64);
-
- if (inodes->bytes_left >= c) {
- inodes->bytes_left -= c;
- inodes->val[inodes->elem_cnt] = inum;
- inodes->val[inodes->elem_cnt + 1] = offset;
- inodes->val[inodes->elem_cnt + 2] = root;
- inodes->elem_cnt += 3;
- } else {
- inodes->bytes_missing += c - inodes->bytes_left;
- inodes->bytes_left = 0;
- inodes->elem_missed += 3;
- }
-
- return 0;
-}
-
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
void __user *arg, int version)
{
@@ -4312,7 +4292,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
}
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- build_ino_list, inodes, ignore_offset);
+ inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4355,13 +4335,79 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
+/**
+ * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
+ * required.
+ *
+ * @fs_info: the filesystem
+ * @excl_acquired: ptr to boolean value which is set to false in case balance
+ * is being resumed
+ *
+ * Return 0 on success in which case both fs_info::balance is acquired as well
+ * as exclusive ops are blocked. In case of failure return an error code.
+ */
+static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
+{
+ int ret;
+
+ /*
+ * Exclusive operation is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ while (1) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ *excl_acquired = true;
+ mutex_lock(&fs_info->balance_mutex);
+ return 0;
+ }
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* This is either (2) or (3) */
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (2) */
+ ret = -EINPROGRESS;
+ goto out_failure;
+
+ } else {
+ mutex_unlock(&fs_info->balance_mutex);
+ /*
+ * Lock released to allow other waiters to
+ * continue, we'll reexamine the status again.
+ */
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (3) */
+ *excl_acquired = false;
+ return 0;
+ }
+ }
+ } else {
+ /* This is (1) */
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_failure;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ }
+
+out_failure:
+ mutex_unlock(&fs_info->balance_mutex);
+ *excl_acquired = false;
+ return ret;
+}
+
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
- bool need_unlock; /* for mut. excl. ops lock */
+ bool need_unlock = true;
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -4378,53 +4424,12 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
goto out;
}
-again:
- if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- mutex_lock(&fs_info->balance_mutex);
- need_unlock = true;
- goto locked;
- }
-
- /*
- * mut. excl. ops lock is locked. Three possibilities:
- * (1) some other op is running
- * (2) balance is running
- * (3) balance is paused -- special case (think resume)
- */
- mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl) {
- /* this is either (2) or (3) */
- if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
- /*
- * Lock released to allow other waiters to continue,
- * we'll reexamine the status again.
- */
- mutex_lock(&fs_info->balance_mutex);
-
- if (fs_info->balance_ctl &&
- !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- /* this is (3) */
- need_unlock = false;
- goto locked;
- }
-
- mutex_unlock(&fs_info->balance_mutex);
- goto again;
- } else {
- /* this is (2) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = -EINPROGRESS;
- goto out;
- }
- } else {
- /* this is (1) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ ret = btrfs_try_lock_balance(fs_info, &need_unlock);
+ if (ret)
goto out;
- }
-locked:
+ lockdep_assert_held(&fs_info->balance_mutex);
+
if (bargs->flags & BTRFS_BALANCE_RESUME) {
if (!fs_info->balance_ctl) {
ret = -ENOTCONN;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 430ad36b8b08..89bc5f825e0a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
out_pages[*cur_out / PAGE_SIZE] = cur_page;
}
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
write_compress_length(kaddr + offset_in_page(*cur_out),
compressed_size);
*cur_out += LZO_LEN;
@@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
orig_out + compressed_size - *cur_out);
- kunmap(cur_page);
+ kunmap_local(kaddr);
if ((*cur_out / PAGE_SIZE) >= max_nr_page)
return -E2BIG;
@@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
}
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
memcpy(kaddr + offset_in_page(*cur_out),
compressed_data + *cur_out - orig_out, copy_len);
@@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
*cur_out += sector_bytes_left;
out:
- kunmap(cur_page);
+ kunmap_local(kaddr);
return 0;
}
@@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap(page_in);
+ data_in = kmap_local_page(page_in);
ret = lzo1x_1_compress(data_in +
offset_in_page(cur_in), in_len,
workspace->cbuf, &out_len,
workspace->mem);
- kunmap(page_in);
+ kunmap_local(data_in);
if (ret < 0) {
pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
@@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -318,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
- kaddr = kmap(cur_page);
- memcpy(dest + *cur_in - orig_in,
- kaddr + offset_in_page(*cur_in),
- copy_len);
- kunmap(cur_page);
+ memcpy_from_page(dest + *cur_in - orig_in, cur_page,
+ offset_in_page(*cur_in), copy_len);
*cur_in += copy_len;
}
@@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- kaddr = kmap(cb->compressed_pages[0]);
+ kaddr = kmap_local_page(cb->compressed_pages[0]);
len_in = read_compress_length(kaddr);
- kunmap(cb->compressed_pages[0]);
+ kunmap_local(kaddr);
cur_in += LZO_LEN;
/*
@@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
ASSERT(cur_page);
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
- kunmap(cur_page);
+ kunmap_local(kaddr);
cur_in += LZO_LEN;
if (seg_len > WORKSPACE_CBUF_LENGTH) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1957b14b329a..1952ac85222c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -272,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
spin_unlock_irq(&tree->lock);
}
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
+}
+
/*
* Mark all ordered extents io inside the specified range finished.
*
- * @page: The invovled page for the opeartion.
+ * @page: The involved page for the operation.
* For uncompressed buffered IO, the page status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
* endio function twice.
- * @finish_func: The function to be executed when all the IO of an ordered
- * extent are finished.
*
* This function is called for endio, thus the range must have ordered
- * extent(s) coveri it.
+ * extent(s) covering it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
- u64 num_bytes, btrfs_func_t finish_func,
- bool uptodate)
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -401,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
cond_wake_up(&entry->wait);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_mark_finished(inode, entry);
spin_unlock_irqrestore(&tree->lock, flags);
- btrfs_init_work(&entry->work, finish_func, NULL, NULL);
+ btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &entry->work);
spin_lock_irqsave(&tree->lock, flags);
}
@@ -473,6 +479,7 @@ out:
if (finished && cached && entry) {
*cached = entry;
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
spin_unlock_irqrestore(&tree->lock, flags);
return finished;
@@ -807,8 +814,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup(inode, entry);
+ }
out:
spin_unlock_irqrestore(&tree->lock, flags);
return entry;
@@ -848,8 +857,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
break;
}
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_range(inode, entry);
+ }
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -878,6 +889,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
ASSERT(list_empty(&ordered->log_list));
list_add_tail(&ordered->log_list, list);
refcount_inc(&ordered->refs);
+ trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
spin_unlock_irq(&tree->lock);
}
@@ -901,6 +913,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
spin_unlock_irq(&tree->lock);
return entry;
@@ -975,8 +988,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
/* No ordered extent in the range */
entry = NULL;
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
+ }
+
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -1055,6 +1071,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret = 0;
+ trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+
spin_lock_irq(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ecad67a2c745..87792f85e2c4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -180,13 +180,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
- u64 num_bytes, btrfs_func_t finish_func,
- bool uptodate);
+ u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a5b623ee6fac..1afe32d5ab01 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -63,137 +63,6 @@ struct sector_ptr {
unsigned int uptodate:8;
};
-enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE,
- BTRFS_RBIO_READ_REBUILD,
- BTRFS_RBIO_PARITY_SCRUB,
- BTRFS_RBIO_REBUILD_MISSING,
-};
-
-struct btrfs_raid_bio {
- struct btrfs_io_context *bioc;
-
- /* while we're doing rmw on a stripe
- * we put it into a hash table so we can
- * lock the stripe and merge more rbios
- * into it.
- */
- struct list_head hash_list;
-
- /*
- * LRU list for the stripe cache
- */
- struct list_head stripe_cache;
-
- /*
- * for scheduling work in the helper threads
- */
- struct work_struct work;
-
- /*
- * bio list and bio_list_lock are used
- * to add more bios into the stripe
- * in hopes of avoiding the full rmw
- */
- struct bio_list bio_list;
- spinlock_t bio_list_lock;
-
- /* also protected by the bio_list_lock, the
- * plug list is used by the plugging code
- * to collect partial bios while plugged. The
- * stripe locking code also uses it to hand off
- * the stripe lock to the next pending IO
- */
- struct list_head plug_list;
-
- /*
- * flags that tell us if it is safe to
- * merge with this bio
- */
- unsigned long flags;
-
- /*
- * set if we're doing a parity rebuild
- * for a read from higher up, which is handled
- * differently from a parity rebuild as part of
- * rmw
- */
- enum btrfs_rbio_ops operation;
-
- /* Size of each individual stripe on disk */
- u32 stripe_len;
-
- /* How many pages there are for the full stripe including P/Q */
- u16 nr_pages;
-
- /* How many sectors there are for the full stripe including P/Q */
- u16 nr_sectors;
-
- /* Number of data stripes (no p/q) */
- u8 nr_data;
-
- /* Numer of all stripes (including P/Q) */
- u8 real_stripes;
-
- /* How many pages there are for each stripe */
- u8 stripe_npages;
-
- /* How many sectors there are for each stripe */
- u8 stripe_nsectors;
-
- /* First bad stripe, -1 means no corruption */
- s8 faila;
-
- /* Second bad stripe (for RAID6 use) */
- s8 failb;
-
- /* Stripe number that we're scrubbing */
- u8 scrubp;
-
- /*
- * size of all the bios in the bio_list. This
- * helps us decide if the rbio maps to a full
- * stripe or not
- */
- int bio_list_bytes;
-
- int generic_bio_cnt;
-
- refcount_t refs;
-
- atomic_t stripes_pending;
-
- atomic_t error;
- /*
- * these are two arrays of pointers. We allocate the
- * rbio big enough to hold them both and setup their
- * locations when the rbio is allocated
- */
-
- /* pointers to pages that we allocated for
- * reading/writing stripes directly from the disk (including P/Q)
- */
- struct page **stripe_pages;
-
- /* Pointers to the sectors in the bio_list, for faster lookup */
- struct sector_ptr *bio_sectors;
-
- /*
- * For subpage support, we need to map each sector to above
- * stripe_pages.
- */
- struct sector_ptr *stripe_sectors;
-
- /* Bitmap to record which horizontal stripe has data */
- unsigned long *dbitmap;
-
- /* allocated with real_stripes-many pointers for finish_*() calls */
- void **finish_pointers;
-
- /* Allocated with stripe_nsectors-many bits for finish_*() calls */
- unsigned long *finish_pbitmap;
-};
-
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
@@ -347,6 +216,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
}
}
+static void steal_rbio_page(struct btrfs_raid_bio *src,
+ struct btrfs_raid_bio *dest, int page_nr)
+{
+ const u32 sectorsize = src->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ if (dest->stripe_pages[page_nr])
+ __free_page(dest->stripe_pages[page_nr]);
+ dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
+ src->stripe_pages[page_nr] = NULL;
+
+ /* Also update the sector->uptodate bits. */
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page; i++)
+ dest->stripe_sectors[i].uptodate = true;
+}
+
/*
* Stealing an rbio means taking all the uptodate pages from the stripe array
* in the source rbio and putting them into the destination rbio.
@@ -358,7 +245,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
struct page *s;
- struct page *d;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
@@ -368,12 +254,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
if (!s || !full_page_sectors_uptodate(src, i))
continue;
- d = dest->stripe_pages[i];
- if (d)
- __free_page(d);
-
- dest->stripe_pages[i] = s;
- src->stripe_pages[i] = NULL;
+ steal_rbio_page(src, dest, i);
}
index_stripe_sectors(dest);
index_stripe_sectors(src);
@@ -391,6 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
+ dest->stripe_nsectors);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -590,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio)
int ret = 1;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
- if (size != rbio->nr_data * rbio->stripe_len)
+ if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
ret = 0;
- BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
return ret;
@@ -932,6 +816,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1023,29 +913,30 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_io_context *bioc,
- u32 stripe_len)
+ struct btrfs_io_context *bioc)
{
const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
- const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
+ const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
const unsigned int num_pages = stripe_npages * real_stripes;
- const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
+ const unsigned int stripe_nsectors =
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- int nr_data = 0;
void *p;
- ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * Our current stripe len should be fixed to 64k thus stripe_nsectors
+ * (at most 16) should be no larger than BITS_PER_LONG.
+ */
+ ASSERT(stripe_nsectors <= BITS_PER_LONG);
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
sizeof(*rbio->bio_sectors) * num_sectors +
sizeof(*rbio->stripe_sectors) * num_sectors +
- sizeof(*rbio->finish_pointers) * real_stripes +
- sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
- sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
+ sizeof(*rbio->finish_pointers) * real_stripes,
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -1056,7 +947,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bioc = bioc;
- rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
@@ -1081,18 +971,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
- CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
- CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
#undef CONSUME_ALLOC
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
- nr_data = real_stripes - 1;
- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
- nr_data = real_stripes - 2;
- else
- BUG();
+ ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
+ rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
- rbio->nr_data = nr_data;
return rbio;
}
@@ -1135,7 +1018,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector,
unsigned int stripe_nr,
unsigned int sector_nr,
- unsigned long bio_max_len,
unsigned int opf)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1180,7 +1062,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
+ bio = bio_alloc(stripe->dev->bdev,
+ max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
opf, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> 9;
bio->bi_private = rbio;
@@ -1215,9 +1098,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->raid_map[0];
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_bio(bio)->iter;
-
bio_for_each_segment(bvec, bio, iter) {
u32 bvec_offset;
@@ -1252,6 +1132,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
spin_unlock_irq(&rbio->bio_list_lock);
}
+static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
+ struct raid56_bio_trace_info *trace_info)
+{
+ const struct btrfs_io_context *bioc = rbio->bioc;
+ int i;
+
+ ASSERT(bioc);
+
+ /* We rely on bio->bi_bdev to find the stripe number. */
+ if (!bio->bi_bdev)
+ goto not_found;
+
+ for (i = 0; i < bioc->num_stripes; i++) {
+ if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
+ continue;
+ trace_info->stripe_nr = i;
+ trace_info->devid = bioc->stripes[i].dev->devid;
+ trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ bioc->stripes[i].physical;
+ return;
+ }
+
+not_found:
+ trace_info->devid = -1;
+ trace_info->offset = -1;
+ trace_info->stripe_nr = -1;
+}
+
/*
* this is called from one of two situations. We either
* have a full stripe from the higher layers, or we've read all
@@ -1266,7 +1174,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
+ /* The total sector number inside the full stripe. */
+ int total_sector_nr;
int stripe;
+ /* Sector number inside a stripe. */
int sectornr;
bool has_qstripe;
struct bio_list bio_list;
@@ -1282,6 +1193,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1348,55 +1262,71 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
/*
- * time to start writing. Make bios for everything from the
- * higher layers (the bio_list in our rbio) and our p/q. Ignore
- * everything else.
+ * Start writing. Make bios for everything from the higher layers (the
+ * bio_list in our rbio) and our P/Q. Ignore everything else.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
- continue;
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
- sectornr, rbio->stripe_len,
- REQ_OP_WRITE);
- if (ret)
- goto cleanup;
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
if (likely(!bioc->num_tgtdevs))
goto write_data;
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bioc->tgtdev_map[stripe])
- continue;
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
- continue;
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
+ if (!bioc->tgtdev_map[stripe]) {
+ /*
+ * We can skip the whole stripe completely, note
+ * total_sector_nr will be increased by one anyway.
+ */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
+ continue;
+ }
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- rbio->bioc->tgtdev_map[stripe],
- sectornr, rbio->stripe_len,
- REQ_OP_WRITE);
- if (ret)
- goto cleanup;
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ rbio->bioc->tgtdev_map[stripe],
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
write_data:
@@ -1406,6 +1336,12 @@ write_data:
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_write_end_io;
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -1433,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bioc->num_stripes; i++) {
stripe = &rbio->bioc->stripes[i];
- if (in_range(physical, stripe->physical, rbio->stripe_len) &&
+ if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
@@ -1455,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->nr_data; i++) {
u64 stripe_start = rbio->bioc->raid_map[i];
- if (in_range(logical, stripe_start, rbio->stripe_len))
+ if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
return i;
}
return -1;
@@ -1552,15 +1488,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
}
}
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1571,23 +1499,34 @@ static void raid_rmw_end_io(struct bio *bio)
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+ &rbio->end_io_work);
+}
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+/*
+ * End io handler for the read phase of the RMW cycle. All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ return;
+ }
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write but if there
+ * are any failed stripes we'll reconstruct from parity first.
*/
validate_rbio_for_rmw(rbio);
- return;
-
-cleanup:
-
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
@@ -1598,9 +1537,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
+ const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
int ret;
- int sectornr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -1612,38 +1551,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ /* Build a list of bios to read all the missing data sectors. */
+ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
- /*
- * We want to find all the sectors missing from the
- * rbio and read them from the disk. If * sector_in_rbio()
- * finds a page in the bio list we don't need to read
- * it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a page
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate page.
- * If so, be happy and use it.
- */
- if (sector->uptodate)
- continue;
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate page. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, rbio->stripe_len,
- REQ_OP_READ);
- if (ret)
- goto cleanup;
- }
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -1662,11 +1597,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_rmw_end_io;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_read_partial_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_read_partial(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -1833,27 +1773,53 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bioc->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ fs_info->sectorsize_bits) % rbio->stripe_nsectors;
+
+ set_bit(bit, &rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
- int ret;
+ int ret = 0;
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
btrfs_put_bioc(bioc);
- return PTR_ERR(rbio);
+ ret = PTR_ERR(rbio);
+ goto out_dec_counter;
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
- btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
/*
@@ -1863,8 +1829,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
if (ret)
- btrfs_bio_counter_dec(fs_info);
- return ret;
+ goto out_dec_counter;
+ return;
}
cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
@@ -1875,13 +1841,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- ret = 0;
} else {
ret = __raid56_parity_write(rbio);
if (ret)
- btrfs_bio_counter_dec(fs_info);
+ goto out_dec_counter;
}
- return ret;
+
+ return;
+
+out_dec_counter:
+ btrfs_bio_counter_dec(fs_info);
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
}
/*
@@ -1939,7 +1910,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sectornr, rbio->dbitmap))
+ !test_bit(sectornr, &rbio->dbitmap))
continue;
/*
@@ -2108,25 +2079,13 @@ cleanup_io:
}
/*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
*/
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- /*
- * we only read stripe pages off the disk, set them
- * up to date if there were no errors
- */
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
@@ -2147,8 +2106,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int sectornr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2160,33 +2118,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
if (rbio->faila == stripe || rbio->failb == stripe) {
atomic_inc(&rbio->error);
+ /* Skip the current stripe. */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
continue;
}
-
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- if (sector->uptodate)
- continue;
-
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, rbio->stripe_len,
- REQ_OP_READ);
- if (ret < 0)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret < 0)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2209,11 +2165,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_recover_end_io;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_recover_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
@@ -2236,28 +2197,27 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u32 stripe_len, int mirror_num, int generic_io)
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num, bool generic_io)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- int ret;
if (generic_io) {
ASSERT(bioc->mirror_num == mirror_num);
btrfs_bio(bio)->mirror_num = mirror_num;
+ } else {
+ btrfs_get_bioc(bioc);
}
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- if (generic_io)
- btrfs_put_bioc(bioc);
- return PTR_ERR(rbio);
+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
+ goto out_end_bio;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
@@ -2265,18 +2225,13 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bioc->map_type);
- if (generic_io)
- btrfs_put_bioc(bioc);
kfree(rbio);
- return -EIO;
+ bio->bi_status = BLK_STS_IOERR;
+ goto out_end_bio;
}
- if (generic_io) {
- btrfs_bio_counter_inc_noblocked(fs_info);
+ if (generic_io)
rbio->generic_bio_cnt = 1;
- } else {
- btrfs_get_bioc(bioc);
- }
/*
* Loop retry:
@@ -2295,24 +2250,20 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
rbio->failb--;
}
- ret = lock_stripe_add(rbio);
+ if (lock_stripe_add(rbio))
+ return;
/*
- * __raid56_parity_recover will end the bio with
- * any errors it hits. We don't want to return
- * its error value up the stack because our caller
- * will end up calling bio_endio with any nonzero
- * return
- */
- if (ret == 0)
- __raid56_parity_recover(rbio);
- /*
- * our rbio has been added to the list of
- * rbios that will be handled after the
- * currently lock owner is done
+ * This adds our rbio to the list of rbios that will be handled after
+ * the current lock owner is done.
*/
- return 0;
+ __raid56_parity_recover(rbio);
+ return;
+out_end_bio:
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_put_bioc(bioc);
+ bio_endio(bio);
}
static void rmw_work(struct work_struct *work)
@@ -2343,14 +2294,14 @@ static void read_rebuild_work(struct work_struct *work)
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
- u32 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2374,7 +2325,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
}
ASSERT(i < rbio->real_stripes);
- bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+ bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
/*
* We have already increased bio_counter when getting bioc, record it
@@ -2395,7 +2346,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
ASSERT(logical >= rbio->bioc->raid_map[0]);
ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
- rbio->stripe_len * rbio->nr_data);
+ BTRFS_STRIPE_LEN * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset / sectorsize;
rbio->bio_sectors[index].page = page;
@@ -2409,23 +2360,22 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int stripe;
- int sectornr;
-
- for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- struct page *page;
- int index = (stripe * rbio->stripe_nsectors + sectornr) *
- sectorsize >> PAGE_SHIFT;
+ int total_sector_nr;
- if (rbio->stripe_pages[index])
- continue;
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct page *page;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[index] = page;
- }
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+ if (rbio->stripe_pages[index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
}
index_stripe_sectors(rbio);
return 0;
@@ -2437,7 +2387,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- unsigned long *pbitmap = rbio->finish_pbitmap;
+ unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
int sectornr;
@@ -2460,7 +2410,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
- bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
+ bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
/*
@@ -2497,7 +2447,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* Map the parity stripe just once */
pointers[nr_data] = kmap_local_page(p_sector.page);
- for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
void *parity;
@@ -2525,7 +2475,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
/* Parity is right, needn't writeback */
- bitmap_clear(rbio->dbitmap, sectornr, 1);
+ bitmap_clear(&rbio->dbitmap, sectornr, 1);
kunmap_local(parity);
for (stripe = nr_data - 1; stripe >= 0; stripe--)
@@ -2547,12 +2497,12 @@ writeback:
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
- sectornr, rbio->stripe_len, REQ_OP_WRITE);
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2566,7 +2516,7 @@ writeback:
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector,
bioc->tgtdev_map[rbio->scrubp],
- sectornr, rbio->stripe_len, REQ_OP_WRITE);
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2584,6 +2534,12 @@ submit_write:
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_write_end_io;
+ if (trace_raid56_scrub_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -2671,24 +2627,14 @@ cleanup:
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write, but if there
+ * are any failed stripes we'll reconstruct from parity first
*/
validate_rbio_for_parity_scrub(rbio);
}
@@ -2698,8 +2644,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int sectornr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2709,37 +2654,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
goto cleanup;
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
- /*
- * We want to find all the sectors missing from the
- * rbio and read them from the disk. If * sector_in_rbio()
- * finds a sector in the bio list we don't need to read
- * it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ /* Build a list of bios to read all the missing parts. */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ struct sector_ptr *sector;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate sector.
- * If so, be happy and use it.
- */
- if (sector->uptodate)
- continue;
+ /* No data in the vertical stripe, no need to read. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, rbio->stripe_len,
- REQ_OP_READ);
- if (ret)
- goto cleanup;
- }
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a sector
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
+
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate sector. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2758,11 +2704,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_parity_scrub_end_io;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -2797,13 +2748,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
- u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bioc, length);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index aaad08aefd7d..6f48f9e4c869 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,45 +7,179 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
-static inline int nr_parity_stripes(const struct map_lookup *map)
-{
- if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- return 1;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
- else
- return 0;
-}
+#include <linux/workqueue.h>
+#include "volumes.h"
+
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
+};
+
+struct btrfs_raid_bio {
+ struct btrfs_io_context *bioc;
+
+ /*
+ * While we're doing RMW on a stripe we put it into a hash table so we
+ * can lock the stripe and merge more rbios into it.
+ */
+ struct list_head hash_list;
+
+ /* LRU list for the stripe cache */
+ struct list_head stripe_cache;
+
+ /* For scheduling work in the helper threads */
+ struct work_struct work;
+
+ /*
+ * bio_list and bio_list_lock are used to add more bios into the stripe
+ * in hopes of avoiding the full RMW
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /*
+ * Also protected by the bio_list_lock, the plug list is used by the
+ * plugging code to collect partial bios while plugged. The stripe
+ * locking code also uses it to hand off the stripe lock to the next
+ * pending IO.
+ */
+ struct list_head plug_list;
+
+ /* Flags that tell us if it is safe to merge with this bio. */
+ unsigned long flags;
+
+ /*
+ * Set if we're doing a parity rebuild for a read from higher up, which
+ * is handled differently from a parity rebuild as part of RMW.
+ */
+ enum btrfs_rbio_ops operation;
+
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
+
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Numer of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
+
+ /*
+ * Size of all the bios in the bio_list. This helps us decide if the
+ * rbio maps to a full stripe or not.
+ */
+ int bio_list_bytes;
+
+ int generic_bio_cnt;
+
+ refcount_t refs;
+
+ atomic_t stripes_pending;
+
+ atomic_t error;
+
+ struct work_struct end_io_work;
+
+ /* Bitmap to record which horizontal stripe has data */
+ unsigned long dbitmap;
+
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
+ unsigned long finish_pbitmap;
+
+ /*
+ * These are two arrays of pointers. We allocate the rbio big enough
+ * to hold them both and setup their locations when the rbio is
+ * allocated.
+ */
+
+ /*
+ * Pointers to pages that we allocated for reading/writing stripes
+ * directly from the disk (including P/Q).
+ */
+ struct page **stripe_pages;
+
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
+
+ /*
+ * For subpage support, we need to map each sector to above
+ * stripe_pages.
+ */
+ struct sector_ptr *stripe_sectors;
+
+ /* Allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * No matter signed or not, (-1) is always the one indicating we can not grab
+ * the proper stripe number.
+ */
+struct raid56_bio_trace_info {
+ u64 devid;
+
+ /* The offset inside the stripe. (<= STRIPE_LEN) */
+ u32 offset;
+
+ /*
+ * Stripe number.
+ * 0 is the first data stripe, and nr_data for P stripe,
+ * nr_data + 1 for Q stripe.
+ * >= real_stripes for
+ */
+ u8 stripe_nr;
+};
static inline int nr_data_stripes(const struct map_lookup *map)
{
- return map->num_stripes - nr_parity_stripes(map);
+ return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
+
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
((x) == RAID6_Q_STRIPE))
-struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u32 stripe_len, int mirror_num, int generic_io);
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len);
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num, bool generic_io);
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
unsigned int pgoff, u64 logical);
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
- struct btrfs_io_context *bioc, u32 stripe_len,
+ struct btrfs_io_context *bioc,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
- u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index a3549d587464..8a6cabdb8f93 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -5,6 +5,7 @@
#include "compression.h"
#include "ctree.h"
#include "delalloc-space.h"
+#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
@@ -110,7 +111,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (comp_type == BTRFS_COMPRESS_NONE) {
memcpy_to_page(page, offset_in_page(file_offset), data_start,
datal);
- flush_dcache_page(page);
} else {
ret = btrfs_decompress(comp_type, data_start, page,
offset_in_page(file_offset),
@@ -132,10 +132,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
*
* So what's in the range [500, 4095] corresponds to zeroes.
*/
- if (datal < block_size) {
+ if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- flush_dcache_page(page);
- }
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
@@ -658,7 +656,8 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ const u64 bs = fs_info->sb->s_blocksize;
int ret;
/*
@@ -669,6 +668,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
@@ -778,6 +779,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e7b0323e6efd..3afe5fa50a63 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -135,15 +135,13 @@ struct scrub_parity {
struct work_struct work;
/* Mark the parity blocks which have data */
- unsigned long *dbitmap;
+ unsigned long dbitmap;
/*
* Mark the parity blocks which have data, but errors happen when
* read data or check data
*/
- unsigned long *ebitmap;
-
- unsigned long bitmap[];
+ unsigned long ebitmap;
};
struct scrub_ctx {
@@ -1218,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
u64 *raid_map,
- u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
@@ -1233,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
continue;
if (logical >= raid_map[i] &&
- logical < raid_map[i] + mapped_length)
+ logical < raid_map[i] + BTRFS_STRIPE_LEN)
break;
}
@@ -1337,7 +1334,6 @@ leave_nomem:
scrub_stripe_index_and_offset(logical,
bioc->map_type,
bioc->raid_map,
- mapped_length,
bioc->num_stripes -
bioc->num_tgtdevs,
mirror_index,
@@ -1380,19 +1376,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
- int ret;
- int mirror_num;
bio->bi_iter.bi_sector = sector->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
-
- mirror_num = sector->sblock->sectors[0]->mirror_num;
- ret = raid56_parity_recover(bio, sector->recover->bioc,
- sector->recover->map_length,
- mirror_num, 0);
- if (ret)
- return ret;
+ raid56_parity_recover(bio, sector->recover->bioc,
+ sector->sblock->sectors[0]->mirror_num, false);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
@@ -2197,7 +2186,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(bio, bioc, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc);
if (!rbio)
goto rbio_out;
@@ -2406,13 +2395,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
}
static void scrub_block_complete(struct scrub_block *sblock)
@@ -2763,7 +2752,7 @@ static void scrub_free_parity(struct scrub_parity *sparity)
struct scrub_sector *curr, *next;
int nbits;
- nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
if (nbits) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors += nbits;
@@ -2795,8 +2784,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
- sparity->nsectors);
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
+ &sparity->dbitmap, sparity->nsectors);
bio_put(bio);
@@ -2814,8 +2803,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
u64 length;
int ret;
- if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
- sparity->nsectors))
+ if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
+ &sparity->ebitmap, sparity->nsectors))
goto out;
length = sparity->logic_end - sparity->logic_start;
@@ -2831,9 +2820,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
sparity->scrub_dev,
- sparity->dbitmap,
+ &sparity->dbitmap,
sparity->nsectors);
if (!rbio)
goto rbio_out;
@@ -2847,7 +2836,7 @@ rbio_out:
bioc_out:
btrfs_bio_counter_dec(fs_info);
btrfs_put_bioc(bioc);
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2856,11 +2845,6 @@ out:
scrub_free_parity(sparity);
}
-static inline int scrub_calc_parity_bitmap_len(int nsectors)
-{
- return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
-}
-
static void scrub_parity_get(struct scrub_parity *sparity)
{
refcount_inc(&sparity->refs);
@@ -3131,7 +3115,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int ret;
struct scrub_parity *sparity;
int nsectors;
- int bitmap_len;
path = btrfs_alloc_path();
if (!path) {
@@ -3145,9 +3128,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
ASSERT(map->stripe_len <= U32_MAX);
nsectors = map->stripe_len >> fs_info->sectorsize_bits;
- bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
- sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
- GFP_NOFS);
+ ASSERT(nsectors <= BITS_PER_LONG);
+ sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
if (!sparity) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -3165,8 +3147,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->sectors_list);
- sparity->dbitmap = sparity->bitmap;
- sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
ret = 0;
for (cur_logical = logic_start; cur_logical < logic_end;
@@ -3429,20 +3409,22 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct extent_map *em,
struct btrfs_device *scrub_dev,
- int stripe_index, u64 dev_extent_len)
+ int stripe_index)
{
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root;
struct btrfs_root *csum_root;
struct blk_plug plug;
+ struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
u64 physical = map->stripes[stripe_index].physical;
- const u64 physical_end = physical + dev_extent_len;
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
/* The logical increment after finishing one stripe */
@@ -3569,8 +3551,8 @@ next:
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
- sctx->stat.last_physical = map->stripes[stripe_index].physical +
- dev_extent_len;
+ sctx->stat.last_physical =
+ map->stripes[stripe_index].physical + dev_stripe_len;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
@@ -3639,8 +3621,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
- dev_extent_len);
+ ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
if (ret)
goto out;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fa56890ff81f..93130367fef2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -16,6 +16,7 @@
#include <linux/crc32c.h>
#include "send.h"
+#include "ctree.h"
#include "backref.h"
#include "locking.h"
#include "disk-io.h"
@@ -81,8 +82,12 @@ struct send_ctx {
char *send_buf;
u32 send_size;
u32 send_max_size;
- u64 total_send_size;
- u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ /*
+ * Whether BTRFS_SEND_A_DATA attribute was already added to current
+ * command (since protocol v2, data must be the last attribute).
+ */
+ bool put_data;
+ struct page **send_buf_pages;
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
/* Protocol version compatibility requested */
u32 proto;
@@ -112,14 +117,14 @@ struct send_ctx {
*/
u64 cur_ino;
u64 cur_inode_gen;
- int cur_inode_new;
- int cur_inode_new_gen;
- int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool cur_inode_new;
+ bool cur_inode_new_gen;
+ bool cur_inode_deleted;
bool ignore_cur_inode;
u64 send_progress;
@@ -333,8 +338,8 @@ __maybe_unused
static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
switch (sctx->proto) {
- case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
- case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
default: return false;
}
}
@@ -575,15 +580,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
while (pos < len) {
ret = kernel_write(filp, buf + pos, len - pos, off);
- /* TODO handle that correctly */
- /*if (ret == -ERESTARTSYS) {
- continue;
- }*/
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (ret == 0)
return -EIO;
- }
pos += ret;
}
@@ -596,6 +596,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
int total_len = sizeof(*hdr) + len;
int left = sctx->send_max_size - sctx->send_size;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+
if (unlikely(left < total_len))
return -EOVERFLOW;
@@ -616,6 +619,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
}
+TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
@@ -691,8 +695,7 @@ static int send_header(struct send_ctx *sctx)
struct btrfs_stream_header hdr;
strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
- hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
-
+ hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
}
@@ -732,9 +735,8 @@ static int send_cmd(struct send_ctx *sctx)
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
- sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
+ sctx->put_data = false;
return ret;
}
@@ -840,7 +842,7 @@ out:
*/
static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
- u64 *gid, u64 *rdev)
+ u64 *gid, u64 *rdev, u64 *fileattr)
{
int ret;
struct btrfs_inode_item *ii;
@@ -870,6 +872,12 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
*gid = btrfs_inode_gid(path->nodes[0], ii);
if (rdev)
*rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ /*
+ * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
+ * otherwise logically split to 32/32 parts.
+ */
+ if (fileattr)
+ *fileattr = btrfs_inode_flags(path->nodes[0], ii);
return ret;
}
@@ -877,7 +885,7 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
static int get_inode_info(struct btrfs_root *root,
u64 ino, u64 *size, u64 *gen,
u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+ u64 *rdev, u64 *fileattr)
{
struct btrfs_path *path;
int ret;
@@ -886,7 +894,7 @@ static int get_inode_info(struct btrfs_root *root,
if (!path)
return -ENOMEM;
ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
- rdev);
+ rdev, fileattr);
btrfs_free_path(path);
return ret;
}
@@ -1632,7 +1640,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
u64 right_gen;
ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
left_ret = ret;
@@ -1641,7 +1649,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
right_ret = -ENOENT;
} else {
ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
right_ret = ret;
@@ -1804,7 +1812,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
if (dir_gen) {
ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
}
@@ -1876,7 +1884,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
*/
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1904,7 +1912,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, who_mode, NULL, NULL, NULL);
+ who_gen, who_mode, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -1943,7 +1951,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
if (dir != BTRFS_FIRST_FREE_OBJECTID) {
ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1966,7 +1974,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
}
ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -2495,6 +2503,39 @@ out:
return ret;
}
+static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ if (sctx->proto < 2)
+ return 0;
+
+ btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
@@ -2574,7 +2615,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
- /* TODO Add otime support when the otime patches get into upstream */
+ if (sctx->proto >= 2)
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
ret = send_cmd(sctx);
@@ -2608,7 +2650,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
if (ino != sctx->cur_ino) {
ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
- NULL, NULL, &rdev);
+ NULL, NULL, &rdev, NULL);
if (ret < 0)
goto out;
} else {
@@ -3311,7 +3353,7 @@ finish:
* The parent inode might have been deleted in the send snapshot
*/
ret = get_inode_info(sctx->send_root, cur->dir, NULL,
- NULL, NULL, NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL, NULL);
if (ret == -ENOENT) {
ret = 0;
continue;
@@ -3486,11 +3528,11 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
}
ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
- &left_gen, NULL, NULL, NULL, NULL);
+ &left_gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
- &right_gen, NULL, NULL, NULL, NULL);
+ &right_gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -3621,7 +3663,7 @@ static int is_ancestor(struct btrfs_root *root,
}
ret = get_inode_info(root, parent, NULL, &parent_gen,
- NULL, NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3713,7 +3755,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
ret = get_inode_info(sctx->parent_root, ino, NULL,
&parent_ino_gen, NULL, NULL, NULL,
- NULL);
+ NULL, NULL);
if (ret < 0)
goto out;
if (ino_gen == parent_ino_gen) {
@@ -4319,8 +4361,7 @@ static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
if (!p)
return -ENOMEM;
- ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -4408,7 +4449,7 @@ static int __find_iref(int num, u64 dir, int index,
* else matches.
*/
ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret)
return ret;
if (dir_gen != ctx->dir_gen)
@@ -4452,7 +4493,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
struct send_ctx *sctx = ctx;
ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret)
return ret;
@@ -4475,7 +4516,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
struct send_ctx *sctx = ctx;
ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret)
return ret;
@@ -4860,14 +4901,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx)
static int put_data_header(struct send_ctx *sctx, u32 len)
{
- struct btrfs_tlv_header *hdr;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+ sctx->put_data = true;
+ if (sctx->proto >= 2) {
+ /*
+ * Since v2, the data attribute header doesn't include a length,
+ * it is implicitly to the end of the command.
+ */
+ if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ return -EOVERFLOW;
+ put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
+ sctx->send_size += sizeof(__le16);
+ } else {
+ struct btrfs_tlv_header *hdr;
- if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
- return -EOVERFLOW;
- hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
- put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
- put_unaligned_le16(len, &hdr->tlv_len);
- sctx->send_size += sizeof(*hdr);
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ }
return 0;
}
@@ -5010,7 +5065,7 @@ static int send_clone(struct send_ctx *sctx,
if (clone_root->root == sctx->send_root) {
ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL, NULL);
+ &gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -5137,17 +5192,214 @@ tlv_put_failure:
return ret;
}
-static int send_extent_data(struct send_ctx *sctx,
- const u64 offset,
- const u64 len)
+static int send_encoded_inline_extent(struct send_ctx *sctx,
+ struct btrfs_path *path, u64 offset,
+ u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 ram_bytes;
+ size_t inline_size;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + ram_bytes - offset, len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+
+ ret = put_data_header(sctx, inline_size);
+ if (ret < 0)
+ goto out;
+ read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
+ btrfs_file_extent_inline_start(ei), inline_size);
+ sctx->send_size += inline_size;
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
+ u64 offset, u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 disk_bytenr, disk_num_bytes;
+ u32 data_offset;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
+ len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
+ btrfs_file_extent_ram_bytes(leaf, ei));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
+ offset - key.offset + btrfs_file_extent_offset(leaf, ei));
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
+
+ ret = put_data_header(sctx, disk_num_bytes);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We want to do I/O directly into the send buffer, so get the next page
+ * boundary in the send buffer. This means that there may be a gap
+ * between the beginning of the command and the file data.
+ */
+ data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ if (data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ /*
+ * Note that send_buf is a mapping of send_buf_pages, so this is really
+ * reading into send_buf.
+ */
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ disk_bytenr, disk_num_bytes,
+ sctx->send_buf_pages +
+ (data_offset >> PAGE_SHIFT));
+ if (ret)
+ goto out;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
+ hdr->crc = 0;
+ crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+ if (!ret) {
+ ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
+ disk_num_bytes, &sctx->send_off);
+ }
+ sctx->send_size = 0;
+ sctx->put_data = false;
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
+ const u64 offset, const u64 len)
{
const u64 end = offset + len;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
+ bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_INLINE);
+
+ /*
+ * Send the compressed extent unless the compressed data is
+ * larger than the decompressed data. This can happen if we're
+ * not sending the entire extent, either because it has been
+ * partially overwritten/truncated or because this is a part of
+ * the extent that we couldn't clone in clone_range().
+ */
+ if (is_inline &&
+ btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]) <= len) {
+ return send_encoded_inline_extent(sctx, path, offset,
+ len);
+ } else if (!is_inline &&
+ btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
+ return send_encoded_extent(sctx, path, offset, len);
+ }
+ }
+
if (sctx->cur_inode == NULL) {
struct btrfs_root *root = sctx->send_root;
@@ -5285,12 +5537,9 @@ out:
return ret;
}
-static int clone_range(struct send_ctx *sctx,
- struct clone_root *clone_root,
- const u64 disk_byte,
- u64 data_offset,
- u64 offset,
- u64 len)
+static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
+ struct clone_root *clone_root, const u64 disk_byte,
+ u64 data_offset, u64 offset, u64 len)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -5314,7 +5563,7 @@ static int clone_range(struct send_ctx *sctx,
*/
if (clone_root->offset == 0 &&
len == sctx->send_root->fs_info->sectorsize)
- return send_extent_data(sctx, offset, len);
+ return send_extent_data(sctx, dst_path, offset, len);
path = alloc_path_for_send();
if (!path)
@@ -5325,7 +5574,8 @@ static int clone_range(struct send_ctx *sctx,
* accept clones from these extents.
*/
ret = __get_inode_info(clone_root->root, path, clone_root->ino,
- &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+ &clone_src_i_size, NULL, NULL, NULL, NULL, NULL,
+ NULL);
btrfs_release_path(path);
if (ret < 0)
goto out;
@@ -5411,7 +5661,8 @@ static int clone_range(struct send_ctx *sctx,
if (hole_len > len)
hole_len = len;
- ret = send_extent_data(sctx, offset, hole_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ hole_len);
if (ret < 0)
goto out;
@@ -5484,14 +5735,16 @@ static int clone_range(struct send_ctx *sctx,
if (ret < 0)
goto out;
}
- ret = send_extent_data(sctx, offset + slen,
+ ret = send_extent_data(sctx, dst_path,
+ offset + slen,
clone_len - slen);
} else {
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
} else {
- ret = send_extent_data(sctx, offset, clone_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ clone_len);
}
if (ret < 0)
@@ -5523,7 +5776,7 @@ next:
}
if (len > 0)
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
out:
@@ -5554,10 +5807,10 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
- ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, end - offset);
+ ret = clone_range(sctx, path, clone_root, disk_byte,
+ data_offset, offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, end - offset);
+ ret = send_extent_data(sctx, path, offset, end - offset);
}
sctx->cur_inode_next_write_offset = end;
return ret;
@@ -6017,11 +6270,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
u64 left_mode;
u64 left_uid;
u64 left_gid;
+ u64 left_fileattr;
u64 right_mode;
u64 right_uid;
u64 right_gid;
+ u64 right_fileattr;
int need_chmod = 0;
int need_chown = 0;
+ bool need_fileattr = false;
int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
@@ -6055,7 +6311,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid, NULL);
+ &left_mode, &left_uid, &left_gid, NULL, &left_fileattr);
if (ret < 0)
goto out;
@@ -6070,7 +6326,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
&old_size, NULL, &right_mode, &right_uid,
- &right_gid, NULL);
+ &right_gid, NULL, &right_fileattr);
if (ret < 0)
goto out;
@@ -6078,6 +6334,8 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
+ need_fileattr = true;
if ((old_size == sctx->cur_inode_size) ||
(sctx->cur_inode_size > old_size &&
sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
@@ -6121,6 +6379,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret < 0)
goto out;
}
+ if (need_fileattr) {
+ ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_fileattr);
+ if (ret < 0)
+ goto out;
+ }
ret = send_capabilities(sctx);
if (ret < 0)
@@ -6265,7 +6529,7 @@ static int changed_inode(struct send_ctx *sctx,
close_current_inode(sctx);
sctx->cur_ino = key->objectid;
- sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_new_gen = false;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
@@ -6306,7 +6570,7 @@ static int changed_inode(struct send_ctx *sctx,
*/
if (left_gen != right_gen &&
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
- sctx->cur_inode_new_gen = 1;
+ sctx->cur_inode_new_gen = true;
}
/*
@@ -6338,8 +6602,8 @@ static int changed_inode(struct send_ctx *sctx,
if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6350,8 +6614,8 @@ static int changed_inode(struct send_ctx *sctx,
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6369,8 +6633,8 @@ static int changed_inode(struct send_ctx *sctx,
* First, process the inode as if it was deleted.
*/
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6384,8 +6648,8 @@ static int changed_inode(struct send_ctx *sctx,
* Now process the inode as if it was new.
*/
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6417,9 +6681,9 @@ static int changed_inode(struct send_ctx *sctx,
goto out;
} else {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_new_gen = 0;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_new_gen = false;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6532,12 +6796,12 @@ static int dir_changed(struct send_ctx *sctx, u64 dir)
int ret;
ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret)
return ret;
ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret)
return ret;
@@ -7533,6 +7797,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
} else {
sctx->proto = 1;
}
+ if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
@@ -7552,8 +7820,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
sctx->clone_roots_cnt = arg->clone_sources_count;
- sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ if (sctx->proto >= 2) {
+ u32 send_buf_num_pages;
+
+ sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
+ sctx->send_buf_pages = kcalloc(send_buf_num_pages,
+ sizeof(*sctx->send_buf_pages),
+ GFP_KERNEL);
+ if (!sctx->send_buf_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < send_buf_num_pages; i++) {
+ sctx->send_buf_pages[i] =
+ vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
+ }
+ } else {
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ }
if (!sctx->send_buf) {
ret = -ENOMEM;
goto out;
@@ -7746,6 +8037,7 @@ out:
fput(sctx->send_filp);
kvfree(sctx->clone_roots);
+ kfree(sctx->send_buf_pages);
kvfree(sctx->send_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 08602fdd600a..4bb4e6a638cb 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -7,12 +7,19 @@
#ifndef BTRFS_SEND_H
#define BTRFS_SEND_H
-#include "ctree.h"
+#include <linux/types.h>
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
-#define BTRFS_SEND_STREAM_VERSION 1
+#define BTRFS_SEND_STREAM_VERSION 2
-#define BTRFS_SEND_BUF_SIZE SZ_64K
+/*
+ * In send stream v1, no command is larger than 64K. In send stream v2, no limit
+ * should be assumed.
+ */
+#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
+
+struct inode;
+struct btrfs_ioctl_send_args;
enum btrfs_tlv_type {
BTRFS_TLV_U8,
@@ -46,87 +53,117 @@ struct btrfs_tlv_header {
/* commands */
enum btrfs_send_cmd {
- BTRFS_SEND_C_UNSPEC,
+ BTRFS_SEND_C_UNSPEC = 0,
/* Version 1 */
- BTRFS_SEND_C_SUBVOL,
- BTRFS_SEND_C_SNAPSHOT,
+ BTRFS_SEND_C_SUBVOL = 1,
+ BTRFS_SEND_C_SNAPSHOT = 2,
- BTRFS_SEND_C_MKFILE,
- BTRFS_SEND_C_MKDIR,
- BTRFS_SEND_C_MKNOD,
- BTRFS_SEND_C_MKFIFO,
- BTRFS_SEND_C_MKSOCK,
- BTRFS_SEND_C_SYMLINK,
+ BTRFS_SEND_C_MKFILE = 3,
+ BTRFS_SEND_C_MKDIR = 4,
+ BTRFS_SEND_C_MKNOD = 5,
+ BTRFS_SEND_C_MKFIFO = 6,
+ BTRFS_SEND_C_MKSOCK = 7,
+ BTRFS_SEND_C_SYMLINK = 8,
- BTRFS_SEND_C_RENAME,
- BTRFS_SEND_C_LINK,
- BTRFS_SEND_C_UNLINK,
- BTRFS_SEND_C_RMDIR,
+ BTRFS_SEND_C_RENAME = 9,
+ BTRFS_SEND_C_LINK = 10,
+ BTRFS_SEND_C_UNLINK = 11,
+ BTRFS_SEND_C_RMDIR = 12,
- BTRFS_SEND_C_SET_XATTR,
- BTRFS_SEND_C_REMOVE_XATTR,
+ BTRFS_SEND_C_SET_XATTR = 13,
+ BTRFS_SEND_C_REMOVE_XATTR = 14,
- BTRFS_SEND_C_WRITE,
- BTRFS_SEND_C_CLONE,
+ BTRFS_SEND_C_WRITE = 15,
+ BTRFS_SEND_C_CLONE = 16,
- BTRFS_SEND_C_TRUNCATE,
- BTRFS_SEND_C_CHMOD,
- BTRFS_SEND_C_CHOWN,
- BTRFS_SEND_C_UTIMES,
+ BTRFS_SEND_C_TRUNCATE = 17,
+ BTRFS_SEND_C_CHMOD = 18,
+ BTRFS_SEND_C_CHOWN = 19,
+ BTRFS_SEND_C_UTIMES = 20,
- BTRFS_SEND_C_END,
- BTRFS_SEND_C_UPDATE_EXTENT,
- __BTRFS_SEND_C_MAX_V1,
+ BTRFS_SEND_C_END = 21,
+ BTRFS_SEND_C_UPDATE_EXTENT = 22,
+ BTRFS_SEND_C_MAX_V1 = 22,
/* Version 2 */
- __BTRFS_SEND_C_MAX_V2,
+ BTRFS_SEND_C_FALLOCATE = 23,
+ BTRFS_SEND_C_FILEATTR = 24,
+ BTRFS_SEND_C_ENCODED_WRITE = 25,
+ BTRFS_SEND_C_MAX_V2 = 25,
/* End */
- __BTRFS_SEND_C_MAX,
+ BTRFS_SEND_C_MAX = 25,
};
-#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
/* attributes in send stream */
enum {
- BTRFS_SEND_A_UNSPEC,
-
- BTRFS_SEND_A_UUID,
- BTRFS_SEND_A_CTRANSID,
-
- BTRFS_SEND_A_INO,
- BTRFS_SEND_A_SIZE,
- BTRFS_SEND_A_MODE,
- BTRFS_SEND_A_UID,
- BTRFS_SEND_A_GID,
- BTRFS_SEND_A_RDEV,
- BTRFS_SEND_A_CTIME,
- BTRFS_SEND_A_MTIME,
- BTRFS_SEND_A_ATIME,
- BTRFS_SEND_A_OTIME,
-
- BTRFS_SEND_A_XATTR_NAME,
- BTRFS_SEND_A_XATTR_DATA,
-
- BTRFS_SEND_A_PATH,
- BTRFS_SEND_A_PATH_TO,
- BTRFS_SEND_A_PATH_LINK,
-
- BTRFS_SEND_A_FILE_OFFSET,
- BTRFS_SEND_A_DATA,
-
- BTRFS_SEND_A_CLONE_UUID,
- BTRFS_SEND_A_CLONE_CTRANSID,
- BTRFS_SEND_A_CLONE_PATH,
- BTRFS_SEND_A_CLONE_OFFSET,
- BTRFS_SEND_A_CLONE_LEN,
-
- __BTRFS_SEND_A_MAX,
+ BTRFS_SEND_A_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_A_UUID = 1,
+ BTRFS_SEND_A_CTRANSID = 2,
+
+ BTRFS_SEND_A_INO = 3,
+ BTRFS_SEND_A_SIZE = 4,
+ BTRFS_SEND_A_MODE = 5,
+ BTRFS_SEND_A_UID = 6,
+ BTRFS_SEND_A_GID = 7,
+ BTRFS_SEND_A_RDEV = 8,
+ BTRFS_SEND_A_CTIME = 9,
+ BTRFS_SEND_A_MTIME = 10,
+ BTRFS_SEND_A_ATIME = 11,
+ BTRFS_SEND_A_OTIME = 12,
+
+ BTRFS_SEND_A_XATTR_NAME = 13,
+ BTRFS_SEND_A_XATTR_DATA = 14,
+
+ BTRFS_SEND_A_PATH = 15,
+ BTRFS_SEND_A_PATH_TO = 16,
+ BTRFS_SEND_A_PATH_LINK = 17,
+
+ BTRFS_SEND_A_FILE_OFFSET = 18,
+ /*
+ * As of send stream v2, this attribute is special: it must be the last
+ * attribute in a command, its header contains only the type, and its
+ * length is implicitly the remaining length of the command.
+ */
+ BTRFS_SEND_A_DATA = 19,
+
+ BTRFS_SEND_A_CLONE_UUID = 20,
+ BTRFS_SEND_A_CLONE_CTRANSID = 21,
+ BTRFS_SEND_A_CLONE_PATH = 22,
+ BTRFS_SEND_A_CLONE_OFFSET = 23,
+ BTRFS_SEND_A_CLONE_LEN = 24,
+
+ BTRFS_SEND_A_MAX_V1 = 24,
+
+ /* Version 2 */
+ BTRFS_SEND_A_FALLOCATE_MODE = 25,
+
+ /*
+ * File attributes from the FS_*_FL namespace (i_flags, xflags),
+ * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored
+ * in btrfs_inode_item::flags (represented by btrfs_inode::flags and
+ * btrfs_inode::ro_flags).
+ */
+ BTRFS_SEND_A_FILEATTR = 26,
+
+ BTRFS_SEND_A_UNENCODED_FILE_LEN = 27,
+ BTRFS_SEND_A_UNENCODED_LEN = 28,
+ BTRFS_SEND_A_UNENCODED_OFFSET = 29,
+ /*
+ * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from
+ * BTRFS_SEND_C_ENCODED_WRITE.
+ */
+ BTRFS_SEND_A_COMPRESSION = 30,
+ BTRFS_SEND_A_ENCRYPTION = 31,
+ BTRFS_SEND_A_MAX_V2 = 31,
+
+ /* End */
+ BTRFS_SEND_A_MAX = 31,
};
-#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
-#ifdef __KERNEL__
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
-#endif
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 2dd8754cb990..62d25112310d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -187,6 +187,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return SZ_1G;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
+ */
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size)
+{
+ WRITE_ONCE(space_info->chunk_size, chunk_size);
+}
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -208,6 +239,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
@@ -1280,7 +1312,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
/*
* This is the priority reclaim path, so to_reclaim could be >0 still
- * because we may have only satisified the priority tickets and still
+ * because we may have only satisfied the priority tickets and still
* left non priority tickets on the list. We would then have
* to_reclaim but ->bytes == 0.
*/
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c096695598c1..e7de24a529cf 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -25,6 +25,8 @@ struct btrfs_space_info {
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
+ /* Chunk size in bytes */
+ u64 chunk_size;
/*
* Once a block group drops below this threshold (percents) we'll
@@ -123,6 +125,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info);
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index f429256f56db..12455b2b41de 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
{
const unsigned long member_offset = (unsigned long)ptr + off;
- if (member_offset > eb->len) {
+ if (unlikely(member_offset + size > eb->len)) {
btrfs_warn(eb->fs_info,
- "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d",
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
- if (member_offset + size > eb->len) {
- btrfs_warn(eb->fs_info,
- "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d",
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
(unsigned long)ptr, eb->start, member_offset, size);
return false;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index a105b291444f..6fc2b77ae5c3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -123,7 +123,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/*
- * We have cases like a dummy extent buffer page, which is not mappped
+ * We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
if (page->mapping)
@@ -731,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* It should not have any subpage::writers count.
* Can be unlocked by unlock_page().
* This is the most common locked page for __extent_writepage() called
- * inside extent_write_cache_pages() or extent_write_full_page().
+ * inside extent_write_cache_pages().
* Rarer cases include the @locked_page from extent_write_locked_range().
*
* - Page locked by lock_delalloc_pages()
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6627dd7875ee..41652dcd16f4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,7 @@
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
+#include "raid56.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -72,7 +73,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data);
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
/*
- * Characters to print to indicate error conditions or uncommon filesystem sate.
+ * Characters to print to indicate error conditions or uncommon filesystem state.
* RO is not an error.
*/
static const char fs_state_chars[] = {
@@ -1931,10 +1932,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
- new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
@@ -2275,17 +2272,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
- * In order to avoid overwriting the superblock on the drive,
- * btrfs starts at an offset of at least 1MB when doing chunk
- * allocation.
- *
- * This ensures we have at least min_stripe_size free space
- * after excluding 1MB.
+ * Ensure we have at least min_stripe_size on top of the
+ * reserved space on the device.
*/
- if (avail_space <= SZ_1M + min_stripe_size)
+ if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size)
continue;
- avail_space -= SZ_1M;
+ avail_space -= BTRFS_DEVICE_RANGE_RESERVED;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
@@ -2703,13 +2696,9 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_ref;
- err = btrfs_end_io_wq_init();
- if (err)
- goto free_prelim_ref;
-
err = btrfs_interface_init();
if (err)
- goto free_end_io_wq;
+ goto free_prelim_ref;
btrfs_print_mod_info();
@@ -2725,8 +2714,6 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
-free_end_io_wq:
- btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
@@ -2764,7 +2751,6 @@ static void __exit exit_btrfs_fs(void)
extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
- btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92a1fa8e3da6..c6307b111c2c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,6 +21,7 @@
#include "space-info.h"
#include "block-group.h"
#include "qgroup.h"
+#include "misc.h"
/*
* Structure name Path
@@ -61,6 +62,10 @@ struct raid_kobject {
.store = _store, \
}
+#define BTRFS_ATTR_W(_prefix, _name, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
@@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static struct kobject *get_btrfs_kobj(struct kobject *kobj);
static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
{
@@ -283,9 +289,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
-#ifdef CONFIG_BTRFS_DEBUG
-/* Remove once support for zoned allocation is feature complete */
+#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
#endif
@@ -296,7 +303,7 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
/*
* Features which depend on feature bits and may differ between each fs.
*
- * /sys/fs/btrfs/features - all available features implemeted by this version
+ * /sys/fs/btrfs/features - all available features implemented by this version
* /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
* can be changed on a mounted filesystem.
*/
@@ -314,8 +321,10 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
#endif
#ifdef CONFIG_FS_VERITY
@@ -709,6 +718,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
+static ssize_t btrfs_chunk_size_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size));
+}
+
+/*
+ * Store new chunk size in space info. Can be called on a read-only filesystem.
+ *
+ * If the new chunk size value is larger than 10% of free space it is reduced
+ * to match that limit. Alignment must be to 256M and the system chunk size
+ * cannot be set.
+ */
+static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ char *retptr;
+ u64 val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!fs_info->fs_devices)
+ return -EINVAL;
+
+ if (btrfs_is_zoned(fs_info))
+ return -EINVAL;
+
+ /* System block type must not be changed. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -EPERM;
+
+ val = memparse(buf, &retptr);
+ /* There could be trailing '\n', also catch any typos after the value */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || val == 0)
+ return -EINVAL;
+
+ val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
+
+ /* Limit stripe size to 10% of available space. */
+ val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+
+ /* Must be multiple of 256M. */
+ val &= ~((u64)SZ_256M - 1);
+
+ /* Must be at least 256M. */
+ if (val < SZ_256M)
+ return -EINVAL;
+
+ btrfs_update_space_info_chunk_size(space_info, val);
+
+ return len;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Request chunk allocation with current chunk size.
+ */
+static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ struct btrfs_trans_handle *trans;
+ bool val;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return -EINVAL;
+
+ /*
+ * This is unsafe to be called from sysfs context and may cause
+ * unexpected problems.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_force_chunk_alloc(trans, space_info->flags);
+ btrfs_end_transaction(trans);
+
+ if (ret == 1)
+ return len;
+
+ return -ENOSPC;
+}
+BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store);
+
+#endif
+
SPACE_INFO_ATTR(flags);
SPACE_INFO_ATTR(total_bytes);
SPACE_INFO_ATTR(bytes_used);
@@ -719,6 +834,7 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -773,6 +889,10 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, chunk_size),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -871,6 +991,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
+static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf,
+ "commits %llu\n"
+ "last_commit_ms %llu\n"
+ "max_commit_ms %llu\n"
+ "total_commit_ms %llu\n",
+ fs_info->commit_stats.commit_count,
+ div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
+}
+
+static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long val;
+ int ret;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return ret;
+ if (val)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0);
+
+ return len;
+}
+BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1110,6 +1272,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, generation),
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(, commit_stats),
NULL,
};
@@ -1140,6 +1303,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
return to_fs_devs(kobj)->fs_info;
}
+static struct kobject *get_btrfs_kobj(struct kobject *kobj)
+{
+ while (kobj) {
+ if (kobj->ktype == &btrfs_ktype)
+ return kobj;
+ kobj = kobj->parent;
+ }
+ return NULL;
+}
+
#define NUM_FEATURE_BITS 64
#define BTRFS_FEATURE_NAME_MAX 13
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
@@ -2106,4 +2279,3 @@ void __cold btrfs_exit_sysfs(void)
#endif
kset_unregister(btrfs_kset);
}
-
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 06c0a958d114..4280676f7279 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -2084,12 +2085,23 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
+static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+{
+ fs_info->commit_stats.commit_count++;
+ fs_info->commit_stats.last_commit_dur = interval;
+ fs_info->commit_stats.max_commit_dur =
+ max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
+ fs_info->commit_stats.total_commit_dur += interval;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ktime_t start_time;
+ ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
@@ -2214,6 +2226,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
}
+ /*
+ * Get the time spent on the work done by the commit thread and not
+ * the time spent waiting on a previous commit
+ */
+ start_time = ktime_get_ns();
+
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
@@ -2455,6 +2473,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
+ interval = ktime_get_ns() - start_time;
+
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2462,6 +2482,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ update_commit_stats(fs_info, interval);
+
return ret;
unlock_reloc:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 370388fadf96..d898ba13285f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -171,7 +171,7 @@ again:
int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -194,7 +194,7 @@ again:
* writing.
*/
if (zoned && !created) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -2287,7 +2287,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_key location;
/*
- * Currenly we only log dir index keys. Even if we replay a log created
+ * Currently we only log dir index keys. Even if we replay a log created
* by an older kernel that logged both dir index and dir item keys, all
* we need to do is process the dir index keys, we (and our caller) can
* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
@@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
@@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
inode_only == LOG_INODE_ALL &&
inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_unlock;
}
@@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
bool log_dentries = false;
if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
if (btrfs_root_refs(&root->root_item) == 0) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
@@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
if (ret)
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1620f8170629..57ab5f3b8dc7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -12,6 +12,9 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+/* We can't use the tree log for whatever reason, force a transaction commit */
+#define BTRFS_LOG_FORCE_COMMIT (1)
+
struct btrfs_log_ctx {
int log_ret;
int log_transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9c20049d1fec..6aa6bc769569 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -182,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags)
return btrfs_raid_array[index].raid_name;
}
+int btrfs_nr_parity_stripes(u64 type)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
+
+ return btrfs_raid_array[index].nparity;
+}
+
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
@@ -1396,12 +1403,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
- /*
- * We don't want to overwrite the superblock on the drive nor
- * any area used by the boot loader (grub for example), so we
- * make sure to start at an offset of at least 1MB.
- */
- return max_t(u64, start, SZ_1M);
+ return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
case BTRFS_CHUNK_ALLOC_ZONED:
/*
* We don't care about the starting region like regular
@@ -5071,26 +5073,16 @@ static void init_alloc_chunk_ctl_policy_regular(
struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
- u64 type = ctl->type;
+ struct btrfs_space_info *space_info;
- if (type & BTRFS_BLOCK_GROUP_DATA) {
- ctl->max_stripe_size = SZ_1G;
- ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
- } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* For larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- ctl->max_stripe_size = SZ_1G;
- else
- ctl->max_stripe_size = SZ_256M;
- ctl->max_chunk_size = ctl->max_stripe_size;
- } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ctl->max_stripe_size = SZ_32M;
- ctl->max_chunk_size = 2 * ctl->max_stripe_size;
- ctl->devs_max = min_t(int, ctl->devs_max,
- BTRFS_MAX_DEVS_SYS_CHUNK);
- } else {
- BUG();
- }
+ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
+ ASSERT(space_info);
+
+ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
+ ctl->max_stripe_size = ctl->max_chunk_size;
+
+ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
+ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
/* We don't want a chunk larger than 10% of writable space */
ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
@@ -5720,7 +5712,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
- int ret;
+ enum btrfs_raid_types index;
+ int ret = 1;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
@@ -5733,10 +5726,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
map = em->map_lookup;
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
- ret = map->num_stripes;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- ret = map->sub_stripes;
+ index = btrfs_bg_flags_to_raid_index(map->type);
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ ret = btrfs_raid_array[index].ncopies;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
@@ -5748,8 +5742,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- else
- ret = 1;
free_extent_map(em);
down_read(&fs_info->dev_replace.rwsem);
@@ -5768,6 +5760,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return len;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
@@ -5785,6 +5780,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return 0;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if(!WARN_ON(IS_ERR(em))) {
@@ -5893,7 +5891,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bioc->error, 0);
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
@@ -5917,18 +5914,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
kfree(bioc);
}
-/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that, discard won't be sent to target device of device
* replace.
*/
-static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 *length_ret,
- struct btrfs_io_context **bioc_ret)
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_io_context *bioc;
+ struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5937,29 +5933,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 stripe_cnt;
u64 stripe_len;
u64 stripe_offset;
- u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
- int ret = 0;
+ int ret;
int i;
- /* Discard always returns a bioc. */
- ASSERT(bioc_ret);
-
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
- return PTR_ERR(em);
+ return ERR_CAST(em);
map = em->map_lookup;
+
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
- goto out;
- }
+ goto out_free_map;
+}
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
@@ -5985,7 +5978,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- num_stripes = 1;
+ *num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
@@ -5995,7 +5988,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
- num_stripes = min_t(u64, map->num_stripes,
+ *num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= sub_stripes;
@@ -6005,31 +5998,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = map->num_stripes;
+ *num_stripes = map->num_stripes;
} else {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
}
- bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
- if (!bioc) {
+ stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
+ if (!stripes) {
ret = -ENOMEM;
- goto out;
+ goto out_free_map;
}
- for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical =
+ for (i = 0; i < *num_stripes; i++) {
+ stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bioc->stripes[i].length = stripes_per_dev *
- map->stripe_len;
+ stripes[i].length = stripes_per_dev * map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bioc->stripes[i].length += map->stripe_len;
+ stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -6040,17 +6032,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bioc->stripes[i].length -= stripe_offset;
+ stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bioc->stripes[i].length -= stripe_end_offset;
+ stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bioc->stripes[i].length = length;
+ stripes[i].length = length;
}
stripe_index++;
@@ -6060,12 +6052,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bioc_ret = bioc;
- bioc->map_type = map->type;
- bioc->num_stripes = num_stripes;
-out:
free_extent_map(em);
- return ret;
+ return stripes;
+out_free_map:
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
/*
@@ -6208,7 +6199,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->stripes + i;
new->physical = old->physical;
- new->length = old->length;
new->dev = dev_replace->tgtdev;
bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
@@ -6249,8 +6239,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
bioc->tgtdev_map[index_srcdev] = num_stripes;
@@ -6472,6 +6460,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
@@ -6479,9 +6468,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
- max_errors = nr_parity_stripes(map);
+ max_errors = btrfs_chunk_max_errors(map);
- *length = map->stripe_len;
+ /* Return the length to the full stripe end */
+ *length = min(logical + *length,
+ raid56_full_stripe_start + em->start +
+ data_stripes * stripe_len) - logical;
stripe_index = 0;
stripe_offset = 0;
} else {
@@ -6604,10 +6596,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret, int mirror_num)
{
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bioc_ret);
-
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
@@ -6620,79 +6608,108 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
{
- bio->bi_private = bioc->private;
- bio->bi_end_io = bioc->end_io;
- bio_endio(bio);
+ if (bioc->orig_bio->bi_opf & REQ_META)
+ return bioc->fs_info->endio_meta_workers;
+ return bioc->fs_info->endio_workers;
+}
- btrfs_put_bioc(bioc);
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+
+ bbio->bio.bi_end_io(&bbio->bio);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_io_context *bioc = bio->bi_private;
- int is_orig_bio = 0;
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ struct bio *orig_bio = bioc->orig_bio;
+ struct btrfs_bio *bbio = btrfs_bio(orig_bio);
if (bio->bi_status) {
atomic_inc(&bioc->error);
- if (bio->bi_status == BLK_STS_IOERR ||
- bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_bio(bio)->device;
-
- ASSERT(dev->bdev);
+ if (stripe->dev && stripe->dev->bdev &&
+ (bio->bi_status == BLK_STS_IOERR ||
+ bio->bi_status == BLK_STS_TARGET)) {
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
}
- if (bio == bioc->orig_bio)
- is_orig_bio = 1;
-
btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- if (!is_orig_bio) {
- bio_put(bio);
- bio = bioc->orig_bio;
- }
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- /* only send an error to the higher layers if it is
- * beyond the tolerance of the btrfs bio
- */
- if (atomic_read(&bioc->error) > bioc->max_errors) {
- bio->bi_status = BLK_STS_IOERR;
- } else {
- /*
- * this bio is actually up to date, we didn't
- * go over the max number of errors
- */
- bio->bi_status = BLK_STS_OK;
- }
-
- btrfs_end_bioc(bioc, bio);
- } else if (!is_orig_bio) {
+ if (bio != orig_bio) {
+ bio_endio(orig_bio);
bio_put(bio);
+ return;
+ }
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ orig_bio->bi_status = BLK_STS_IOERR;
+ else
+ orig_bio->bi_status = BLK_STS_OK;
+
+ bbio->mirror_num = bioc->mirror_num;
+ orig_bio->bi_end_io = bioc->end_io;
+ orig_bio->bi_private = bioc->private;
+ if (btrfs_op(orig_bio) == BTRFS_MAP_READ) {
+ bbio->device = stripe->dev;
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+ } else {
+ orig_bio->bi_end_io(orig_bio);
}
+
+ btrfs_put_bioc(bioc);
}
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
- u64 physical, struct btrfs_device *dev)
+static void submit_stripe_bio(struct btrfs_io_context *bioc,
+ struct bio *orig_bio, int dev_nr, bool clone)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
+ u64 physical = bioc->stripes[dev_nr].physical;
+ struct bio *bio;
+
+ if (clone) {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ } else {
+ bio = orig_bio;
+ }
- bio->bi_private = bioc;
- btrfs_bio(bio)->device = dev;
+ bioc->stripes[dev_nr].bioc = bioc;
+ bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
+
+ btrfs_bio_counter_inc_noblocked(fs_info);
+
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ bio_io_error(bio);
+ return;
+ }
+
+ bio_set_dev(bio, dev->bdev);
+
/*
* For zone append writing, bi_sector must point the beginning of the
* zone
@@ -6708,77 +6725,47 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
}
}
btrfs_debug_in_rcu(fs_info,
- "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
- btrfs_bio_counter_inc_noblocked(fs_info);
-
btrfsic_check_bio(bio);
submit_bio(bio);
}
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- /* Should be the original bio. */
- WARN_ON(bio != bioc->orig_bio);
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bioc->error) > bioc->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
- btrfs_end_bioc(bioc, bio);
- }
-}
-
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num)
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
- struct btrfs_device *dev;
- struct bio *first_bio = bio;
u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
int ret;
int dev_nr;
int total_devs;
struct btrfs_io_context *bioc = NULL;
- length = bio->bi_iter.bi_size;
- map_length = length;
-
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
&map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ return;
}
total_devs = bioc->num_stripes;
- bioc->orig_bio = first_bio;
- bioc->private = first_bio->bi_private;
- bioc->end_io = first_bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, bioc->num_stripes);
+ bioc->orig_bio = bio;
+ bioc->private = bio->bi_private;
+ bioc->end_io = bio->bi_end_io;
if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
- /* In this case, map_length has been set to the length of
- a single stripe; not the whole write */
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(bio, bioc, map_length);
- } else {
- ret = raid56_parity_recover(bio, bioc, map_length,
- mirror_num, 1);
- }
-
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ raid56_parity_write(bio, bioc);
+ else
+ raid56_parity_recover(bio, bioc, mirror_num, true);
+ return;
}
if (map_length < length) {
@@ -6789,26 +6776,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bioc->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
- &dev->dev_state) ||
- (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bioc_error(bioc, first_bio, logical);
- continue;
- }
+ const bool should_clone = (dev_nr < total_devs - 1);
- if (dev_nr < total_devs - 1) {
- bio = btrfs_bio_clone(dev->bdev, first_bio);
- } else {
- bio = first_bio;
- bio_set_dev(bio, dev->bdev);
- }
-
- submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, dev_nr, should_clone);
}
btrfs_bio_counter_dec(fs_info);
- return BLK_STS_OK;
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
@@ -6966,11 +6938,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+u64 btrfs_calc_stripe_length(const struct extent_map *em)
{
- const int data_stripes = calc_data_stripes(type, num_stripes);
+ const struct map_lookup *map = em->map_lookup;
+ const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(chunk_len, data_stripes);
+ return div_u64(em->len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -7109,8 +7082,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(type, em->len,
- map->num_stripes);
+ em->orig_block_len = btrfs_calc_stripe_length(em);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -8011,7 +7983,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
map = em->map_lookup;
- stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+ stripe_len = btrfs_calc_stripe_length(em);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
@@ -8021,6 +7993,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
+ /*
+ * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * space. Although kernel can handle it without problem, better to warn
+ * the users.
+ */
+ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
+ btrfs_warn(fs_info,
+ "devid %llu physical %llu len %llu inside the reserved space",
+ devid, physical_offset, physical_len);
+
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->devid == devid &&
map->stripes[i].physical == physical_offset) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6721002000ee..4324c4d40909 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -355,6 +355,13 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
+ */
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+/*
* Additional info to pass along bio.
*
* Mostly for btrfs specific features like csum and mirror_num.
@@ -371,6 +378,9 @@ struct btrfs_bio {
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -391,10 +401,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
}
}
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iters - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
struct btrfs_io_stripe {
struct btrfs_device *dev;
+ union {
+ /* Block mapping */
+ u64 physical;
+ /* For the endio handler */
+ struct btrfs_io_context *bioc;
+ };
+};
+
+struct btrfs_discard_stripe {
+ struct btrfs_device *dev;
u64 physical;
- u64 length; /* only used for discard mappings */
+ u64 length;
};
/*
@@ -415,7 +451,6 @@ struct btrfs_io_stripe {
*/
struct btrfs_io_context {
refcount_t refs;
- atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
@@ -533,6 +568,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret);
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
@@ -541,8 +579,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num);
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -601,6 +638,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
+u64 btrfs_calc_stripe_length(const struct extent_map *em);
+int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 79e8c8cd75ed..7a0f8fa44800 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -94,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
* Possible states of log buffer zones
*
* Empty[0] In use[0] Full[0]
- * Empty[1] * x 0
- * In use[1] 0 x 0
- * Full[1] 1 1 C
+ * Empty[1] * 0 1
+ * In use[1] x x 1
+ * Full[1] 0 0 C
*
* Log position:
* *: Special case, no superblock is written
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 0fe31a6f6e68..5d2ab0bac9d2 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -391,6 +391,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*out_pages = 0;
*total_out = 0;
*total_in = 0;
+ workspace->in_buf.src = NULL;
+ workspace->out_buf.dst = NULL;
/* Initialize the stream */
stream = zstd_init_cstream(&params, len, workspace->mem,
@@ -403,7 +405,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -415,7 +417,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = kmap_local_page(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -450,9 +452,9 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
+ kunmap_local(workspace->out_buf.dst);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
+ workspace->out_buf.dst = NULL;
ret = -E2BIG;
goto out;
}
@@ -462,7 +464,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = kmap_local_page(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -477,15 +479,16 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
- kunmap(in_page);
+ kunmap_local(workspace->out_buf.dst);
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
-
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->out_buf.dst = kmap_local_page(out_page);
}
}
while (1) {
@@ -510,9 +513,9 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
+ kunmap_local(workspace->out_buf.dst);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
+ workspace->out_buf.dst = NULL;
ret = -E2BIG;
goto out;
}
@@ -522,7 +525,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = kmap_local_page(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -538,12 +541,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page) {
- kunmap(in_page);
+ if (workspace->out_buf.dst)
+ kunmap_local(workspace->out_buf.dst);
+ if (workspace->in_buf.src) {
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
}
- if (out_page)
- kunmap(out_page);
return ret;
}
@@ -567,7 +570,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -603,14 +606,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- kunmap(pages_in[page_in_index++]);
+ kunmap_local(workspace->in_buf.src);
+ page_in_index++;
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -619,7 +623,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
zero_fill_bio(cb->orig_bio);
done:
if (workspace->in_buf.src)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(workspace->in_buf.src);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 898c7f301b1b..5717d1881d2f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -282,10 +282,10 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
- * If none of the buffers had errors and they are all
- * uptodate then we can set the page uptodate.
+ * If all of the buffers are uptodate then we can set the page
+ * uptodate.
*/
- if (page_uptodate && !PageError(page))
+ if (page_uptodate)
SetPageUptodate(page);
unlock_page(page);
return;
@@ -1604,7 +1604,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
pgoff_t end;
int i, count;
@@ -1612,24 +1612,24 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
struct buffer_head *head;
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
- pagevec_init(&pvec);
- while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
- count = pagevec_count(&pvec);
+ folio_batch_init(&fbatch);
+ while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
+ count = folio_batch_count(&fbatch);
for (i = 0; i < count; i++) {
- struct page *page = pvec.pages[i];
+ struct folio *folio = fbatch.folios[i];
- if (!page_has_buffers(page))
+ if (!folio_buffers(folio))
continue;
/*
- * We use page lock instead of bd_mapping->private_lock
+ * We use folio lock instead of bd_mapping->private_lock
* to pin buffers here since we can afford to sleep and
* it scales better than a global spinlock lock.
*/
- lock_page(page);
- /* Recheck when the page is locked which pins bhs */
- if (!page_has_buffers(page))
+ folio_lock(folio);
+ /* Recheck when the folio is locked which pins bhs */
+ head = folio_buffers(folio);
+ if (!head)
goto unlock_page;
- head = page_buffers(page);
bh = head;
do {
if (!buffer_mapped(bh) || (bh->b_blocknr < block))
@@ -1643,9 +1643,9 @@ next:
bh = bh->b_this_page;
} while (bh != head);
unlock_page:
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
/* End of range already reached? */
if (index > end || !index)
@@ -2259,6 +2259,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
+ bool page_error = false;
VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
@@ -2283,8 +2284,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
- if (err)
+ if (err) {
folio_set_error(folio);
+ page_error = true;
+ }
}
if (!buffer_mapped(bh)) {
folio_zero_range(folio, i * blocksize,
@@ -2311,7 +2314,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
* All buffers are uptodate - we can set the folio uptodate
* as well. But not if get_block() returned an error.
*/
- if (!folio_test_error(folio))
+ if (!page_error)
folio_mark_uptodate(folio);
folio_unlock(folio);
return 0;
@@ -2534,330 +2537,6 @@ out_unlock:
}
EXPORT_SYMBOL(block_page_mkwrite);
-/*
- * nobh_write_begin()'s prereads are special: the buffer_heads are freed
- * immediately, while under the page lock. So it needs a special end_io
- * handler which does not touch the bh after unlocking it.
- */
-static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
-{
- __end_buffer_read_notouch(bh, uptodate);
-}
-
-/*
- * Attach the singly-linked list of buffers created by nobh_write_begin, to
- * the page (converting it to circular linked list and taking care of page
- * dirty races).
- */
-static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
-{
- struct buffer_head *bh;
-
- BUG_ON(!PageLocked(page));
-
- spin_lock(&page->mapping->private_lock);
- bh = head;
- do {
- if (PageDirty(page))
- set_buffer_dirty(bh);
- if (!bh->b_this_page)
- bh->b_this_page = head;
- bh = bh->b_this_page;
- } while (bh != head);
- attach_page_private(page, head);
- spin_unlock(&page->mapping->private_lock);
-}
-
-/*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
- * The filesystem needs to handle block truncation upon failure.
- */
-int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
- struct page **pagep, void **fsdata,
- get_block_t *get_block)
-{
- struct inode *inode = mapping->host;
- const unsigned blkbits = inode->i_blkbits;
- const unsigned blocksize = 1 << blkbits;
- struct buffer_head *head, *bh;
- struct page *page;
- pgoff_t index;
- unsigned from, to;
- unsigned block_in_page;
- unsigned block_start, block_end;
- sector_t block_in_file;
- int nr_reads = 0;
- int ret = 0;
- int is_mapped_to_disk = 1;
-
- index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
-
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
- *fsdata = NULL;
-
- if (page_has_buffers(page)) {
- ret = __block_write_begin(page, pos, len, get_block);
- if (unlikely(ret))
- goto out_release;
- return ret;
- }
-
- if (PageMappedToDisk(page))
- return 0;
-
- /*
- * Allocate buffers so that we can keep track of state, and potentially
- * attach them to the page if an error occurs. In the common case of
- * no error, they will just be freed again without ever being attached
- * to the page (which is all OK, because we're under the page lock).
- *
- * Be careful: the buffer linked list is a NULL terminated one, rather
- * than the circular one we're used to.
- */
- head = alloc_page_buffers(page, blocksize, false);
- if (!head) {
- ret = -ENOMEM;
- goto out_release;
- }
-
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
-
- /*
- * We loop across all blocks in the page, whether or not they are
- * part of the affected region. This is so we can discover if the
- * page is fully mapped-to-disk.
- */
- for (block_start = 0, block_in_page = 0, bh = head;
- block_start < PAGE_SIZE;
- block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
- int create;
-
- block_end = block_start + blocksize;
- bh->b_state = 0;
- create = 1;
- if (block_start >= to)
- create = 0;
- ret = get_block(inode, block_in_file + block_in_page,
- bh, create);
- if (ret)
- goto failed;
- if (!buffer_mapped(bh))
- is_mapped_to_disk = 0;
- if (buffer_new(bh))
- clean_bdev_bh_alias(bh);
- if (PageUptodate(page)) {
- set_buffer_uptodate(bh);
- continue;
- }
- if (buffer_new(bh) || !buffer_mapped(bh)) {
- zero_user_segments(page, block_start, from,
- to, block_end);
- continue;
- }
- if (buffer_uptodate(bh))
- continue; /* reiserfs does this */
- if (block_start < from || block_end > to) {
- lock_buffer(bh);
- bh->b_end_io = end_buffer_read_nobh;
- submit_bh(REQ_OP_READ, 0, bh);
- nr_reads++;
- }
- }
-
- if (nr_reads) {
- /*
- * The page is locked, so these buffers are protected from
- * any VM or truncate activity. Hence we don't need to care
- * for the buffer_head refcounts.
- */
- for (bh = head; bh; bh = bh->b_this_page) {
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- ret = -EIO;
- }
- if (ret)
- goto failed;
- }
-
- if (is_mapped_to_disk)
- SetPageMappedToDisk(page);
-
- *fsdata = head; /* to be released by nobh_write_end */
-
- return 0;
-
-failed:
- BUG_ON(!ret);
- /*
- * Error recovery is a bit difficult. We need to zero out blocks that
- * were newly allocated, and dirty them to ensure they get written out.
- * Buffers need to be attached to the page at this point, otherwise
- * the handling of potential IO errors during writeout would be hard
- * (could try doing synchronous writeout, but what if that fails too?)
- */
- attach_nobh_buffers(page, head);
- page_zero_new_buffers(page, from, to);
-
-out_release:
- unlock_page(page);
- put_page(page);
- *pagep = NULL;
-
- return ret;
-}
-EXPORT_SYMBOL(nobh_write_begin);
-
-int nobh_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = page->mapping->host;
- struct buffer_head *head = fsdata;
- struct buffer_head *bh;
- BUG_ON(fsdata != NULL && page_has_buffers(page));
-
- if (unlikely(copied < len) && head)
- attach_nobh_buffers(page, head);
- if (page_has_buffers(page))
- return generic_write_end(file, mapping, pos, len,
- copied, page, fsdata);
-
- SetPageUptodate(page);
- set_page_dirty(page);
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- unlock_page(page);
- put_page(page);
-
- while (head) {
- bh = head;
- head = head->b_this_page;
- free_buffer_head(bh);
- }
-
- return copied;
-}
-EXPORT_SYMBOL(nobh_write_end);
-
-/*
- * nobh_writepage() - based on block_full_write_page() except
- * that it tries to operate without attaching bufferheads to
- * the page.
- */
-int nobh_writepage(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
-{
- struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned offset;
- int ret;
-
- /* Is the page fully inside i_size? */
- if (page->index < end_index)
- goto out;
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_SIZE-1);
- if (page->index >= end_index+1 || !offset) {
- unlock_page(page);
- return 0; /* don't care */
- }
-
- /*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- zero_user_segment(page, offset, PAGE_SIZE);
-out:
- ret = mpage_writepage(page, get_block, wbc);
- if (ret == -EAGAIN)
- ret = __block_write_full_page(inode, page, get_block, wbc,
- end_buffer_async_write);
- return ret;
-}
-EXPORT_SYMBOL(nobh_writepage);
-
-int nobh_truncate_page(struct address_space *mapping,
- loff_t from, get_block_t *get_block)
-{
- pgoff_t index = from >> PAGE_SHIFT;
- struct inode *inode = mapping->host;
- unsigned blocksize = i_blocksize(inode);
- struct folio *folio;
- struct buffer_head map_bh;
- size_t offset;
- sector_t iblock;
- int err;
-
- /* Block boundary? Nothing to do */
- if (!(from & (blocksize - 1)))
- return 0;
-
- folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
- mapping_gfp_mask(mapping));
- err = -ENOMEM;
- if (!folio)
- goto out;
-
- if (folio_buffers(folio))
- goto has_buffers;
-
- iblock = from >> inode->i_blkbits;
- map_bh.b_size = blocksize;
- map_bh.b_state = 0;
- err = get_block(inode, iblock, &map_bh, 0);
- if (err)
- goto unlock;
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(&map_bh))
- goto unlock;
-
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (!folio_test_uptodate(folio)) {
- err = mapping->a_ops->read_folio(NULL, folio);
- if (err) {
- folio_put(folio);
- goto out;
- }
- folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
- err = -EIO;
- goto unlock;
- }
- if (folio_buffers(folio))
- goto has_buffers;
- }
- offset = offset_in_folio(folio, from);
- folio_zero_segment(folio, offset, round_up(offset, blocksize));
- folio_mark_dirty(folio);
- err = 0;
-
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-out:
- return err;
-
-has_buffers:
- folio_unlock(folio);
- folio_put(folio);
- return block_truncate_page(mapping, from, get_block);
-}
-EXPORT_SYMBOL(nobh_truncate_page);
-
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 8adf81042498..ccdbec388091 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -22,25 +22,24 @@
static int coda_symlink_filler(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct inode *inode = folio->mapping->host;
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
- char *p = page_address(page);
+ char *p = folio_address(folio);
cii = ITOC(inode);
error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
if (error)
goto fail;
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return error;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 7ae59a6afc5c..61ccf7722fc3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -183,6 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
unsigned int len)
{
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct file_ra_state ra;
struct page *pages[BLKS_PER_BUF];
unsigned i, blocknr, buffer;
unsigned long devsize;
@@ -212,6 +213,9 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
+ file_ra_state_init(&ra, mapping);
+ page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF);
+
for (i = 0; i < BLKS_PER_BUF; i++) {
struct page *page = NULL;
@@ -224,19 +228,6 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
pages[i] = page;
}
- for (i = 0; i < BLKS_PER_BUF; i++) {
- struct page *page = pages[i];
-
- if (page) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- /* asynchronous error */
- put_page(page);
- pages[i] = NULL;
- }
- }
- }
-
buffer = next_buffer;
next_buffer = NEXT_BUFFER(buffer);
buffer_blocknr[buffer] = blocknr;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 14e0ef5e9a20..12bd61d20f69 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
/**
* fscrypt_fname_encrypt() - encrypt a filename
* @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
+ * or of the symlink (for symlink targets). Key must already be
+ * set up.
* @iname: the filename to encrypt
* @out: (output) the encrypted filename
* @olen: size of the encrypted filename. It must be at least @iname->len.
@@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
return 0;
}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
/**
* fname_decrypt() - decrypt a filename
@@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
return bp - dst;
}
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret)
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
@@ -281,6 +283,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
}
/**
+ * fscrypt_fname_encrypted_size() - calculate length of encrypted filename
+ * @inode: parent inode of dentry name being encrypted. Key must
+ * already be set up.
+ * @orig_len: length of the original filename
+ * @max_len: maximum length to return
+ * @encrypted_len_ret: where calculated length should be returned (on success)
+ *
+ * Filenames that are shorter than the maximum length may have their lengths
+ * increased slightly by encryption, due to padding that is applied.
+ *
+ * Return: false if the orig_len is greater than max_len. Otherwise, true and
+ * fill out encrypted_len_ret with the length (up to max_len).
+ */
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+ u32 max_len, u32 *encrypted_len_ret)
+{
+ return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
+ orig_len, max_len,
+ encrypted_len_ret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
+
+/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
@@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len, NAME_MAX,
+ if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 6b4c8094cc7b..3afdaa084773 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -31,7 +31,7 @@
#define FSCRYPT_CONTEXT_V2 2
/* Keep this in sync with include/uapi/linux/fscrypt.h */
-#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM
+#define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2
struct fscrypt_context_v1 {
u8 version; /* FSCRYPT_CONTEXT_V1 */
@@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci);
/* fname.c */
-int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
- u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret);
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
/* hkdf.c */
-
struct fscrypt_hkdf {
struct crypto_shash *hmac_tfm;
};
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index af74599ae1cf..7c01025879b3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(policy, len,
- max_len - sizeof(struct fscrypt_symlink_data),
- &disk_link->len))
+ if (!__fscrypt_fname_encrypted_size(policy, len,
+ max_len - sizeof(struct fscrypt_symlink_data),
+ &disk_link->len))
return -ENAMETOOLONG;
disk_link->len += sizeof(struct fscrypt_symlink_data);
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index c35711896bd4..fbc71abdabe3 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -53,6 +53,13 @@ struct fscrypt_mode fscrypt_modes[] = {
.ivsize = 32,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
+ [FSCRYPT_MODE_AES_256_HCTR2] = {
+ .friendly_name = "AES-256-HCTR2",
+ .cipher_str = "hctr2(aes)",
+ .keysize = 32,
+ .security_strength = 32,
+ .ivsize = 32,
+ },
};
static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 5f858cee1e3b..80b8ca0f340b 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -61,7 +61,7 @@ fscrypt_get_dummy_policy(struct super_block *sb)
return sb->s_cop->get_dummy_policy(sb);
}
-static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
+static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
filenames_mode == FSCRYPT_MODE_AES_256_CTS)
@@ -78,6 +78,14 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
return false;
}
+static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
+{
+ if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+ filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
+ return true;
+ return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
+}
+
static bool supported_direct_key_modes(const struct inode *inode,
u32 contents_mode, u32 filenames_mode)
{
@@ -151,7 +159,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
const struct inode *inode)
{
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode,
policy->filenames_encryption_mode)) {
fscrypt_warn(inode,
"Unsupported encryption modes (contents %d, filenames %d)",
@@ -187,7 +195,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
{
int count = 0;
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode,
policy->filenames_encryption_mode)) {
fscrypt_warn(inode,
"Unsupported encryption modes (contents %d, filenames %d)",
@@ -686,6 +694,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
}
/**
+ * fscrypt_context_for_new_inode() - create an encryption context for a new inode
+ * @ctx: where context should be written
+ * @inode: inode from which to fetch policy and nonce
+ *
+ * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode,
+ * generate a new context and write it to ctx. ctx _must_ be at least
+ * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
+ *
+ * Return: size of the resulting context or a negative error code.
+ */
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
+{
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ BUILD_BUG_ON(sizeof(union fscrypt_context) !=
+ FSCRYPT_SET_CONTEXT_MAX_SIZE);
+
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
+ return -ENOKEY;
+
+ return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce);
+}
+EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
+
+/**
* fscrypt_set_context() - Set the fscrypt context of a new inode
* @inode: a new inode
* @fs_data: private data given by FS and passed to ->set_context()
@@ -701,12 +735,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
union fscrypt_context ctx;
int ctxsize;
- /* fscrypt_prepare_new_inode() should have set up the key already. */
- if (WARN_ON_ONCE(!ci))
- return -ENOKEY;
-
- BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+ ctxsize = fscrypt_context_for_new_inode(&ctx, inode);
+ if (ctxsize < 0)
+ return ctxsize;
/*
* This may be the first time the inode number is available, so do any
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index ee92634196a8..1105ce3c80cb 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -9,6 +9,15 @@ menuconfig DLM
A general purpose distributed lock manager for kernel or userspace
applications.
+config DLM_DEPRECATED_API
+ bool "DLM deprecated API"
+ depends on DLM
+ help
+ Enables deprecated DLM timeout features that will be removed in
+ later Linux kernel releases.
+
+ If you are unsure, say N.
+
config DLM_DEBUG
bool "DLM debugging"
depends on DLM
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 3545fdafc6fb..71dab733cf9a 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -9,7 +9,6 @@ dlm-y := ast.o \
member.o \
memory.o \
midcomms.o \
- netlink.o \
lowcomms.o \
plock.o \
rcom.o \
@@ -18,5 +17,6 @@ dlm-y := ast.o \
requestqueue.o \
user.o \
util.o
+dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o
dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index bfac462dd3e8..19ef136f9e4f 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work)
if (callbacks[i].flags & DLM_CB_SKIP) {
continue;
} else if (callbacks[i].flags & DLM_CB_BAST) {
- bastfn(lkb->lkb_astparam, callbacks[i].mode);
trace_dlm_bast(ls, lkb, callbacks[i].mode);
+ bastfn(lkb->lkb_astparam, callbacks[i].mode);
} else if (callbacks[i].flags & DLM_CB_CAST) {
lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
+ trace_dlm_ast(ls, lkb);
castfn(lkb->lkb_astparam);
- trace_dlm_ast(ls, lkb, lkb->lkb_lksb);
}
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 42eee2783756..ac8b62106ce0 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -75,8 +75,9 @@ struct dlm_cluster {
unsigned int cl_log_info;
unsigned int cl_protocol;
unsigned int cl_mark;
+#ifdef CONFIG_DLM_DEPRECATED_API
unsigned int cl_timewarn_cs;
- unsigned int cl_waitwarn_us;
+#endif
unsigned int cl_new_rsb_count;
unsigned int cl_recover_callbacks;
char cl_cluster_name[DLM_LOCKSPACE_LEN];
@@ -102,8 +103,9 @@ enum {
CLUSTER_ATTR_LOG_INFO,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_MARK,
+#ifdef CONFIG_DLM_DEPRECATED_API
CLUSTER_ATTR_TIMEWARN_CS,
- CLUSTER_ATTR_WAITWARN_US,
+#endif
CLUSTER_ATTR_NEW_RSB_COUNT,
CLUSTER_ATTR_RECOVER_CALLBACKS,
CLUSTER_ATTR_CLUSTER_NAME,
@@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL);
CLUSTER_ATTR(log_info, NULL);
CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
CLUSTER_ATTR(mark, NULL);
+#ifdef CONFIG_DLM_DEPRECATED_API
CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
-CLUSTER_ATTR(waitwarn_us, NULL);
+#endif
CLUSTER_ATTR(new_rsb_count, NULL);
CLUSTER_ATTR(recover_callbacks, NULL);
@@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
[CLUSTER_ATTR_MARK] = &cluster_attr_mark,
+#ifdef CONFIG_DLM_DEPRECATED_API
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
- [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+#endif
[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
@@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_log_debug = dlm_config.ci_log_debug;
cl->cl_log_info = dlm_config.ci_log_info;
cl->cl_protocol = dlm_config.ci_protocol;
+#ifdef CONFIG_DLM_DEPRECATED_API
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
- cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
+#endif
cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
@@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_LOG_INFO 1
#define DEFAULT_PROTOCOL DLM_PROTO_TCP
#define DEFAULT_MARK 0
+#ifdef CONFIG_DLM_DEPRECATED_API
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
-#define DEFAULT_WAITWARN_US 0
+#endif
#define DEFAULT_NEW_RSB_COUNT 128
#define DEFAULT_RECOVER_CALLBACKS 0
#define DEFAULT_CLUSTER_NAME ""
@@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = {
.ci_log_info = DEFAULT_LOG_INFO,
.ci_protocol = DEFAULT_PROTOCOL,
.ci_mark = DEFAULT_MARK,
+#ifdef CONFIG_DLM_DEPRECATED_API
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
- .ci_waitwarn_us = DEFAULT_WAITWARN_US,
+#endif
.ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
.ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
.ci_cluster_name = DEFAULT_CLUSTER_NAME
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index df92b0a07fc6..55c5f2c13ebd 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -37,8 +37,9 @@ struct dlm_config_info {
int ci_log_info;
int ci_protocol;
int ci_mark;
+#ifdef CONFIG_DLM_DEPRECATED_API
int ci_timewarn_cs;
- int ci_waitwarn_us;
+#endif
int ci_new_rsb_count;
int ci_recover_callbacks;
char ci_cluster_name[DLM_LOCKSPACE_LEN];
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 776c3ed519f0..8aca8085d24e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -145,7 +145,9 @@ struct dlm_args {
void (*bastfn) (void *astparam, int mode);
int mode;
struct dlm_lksb *lksb;
+#ifdef CONFIG_DLM_DEPRECATED_API
unsigned long timeout;
+#endif
};
@@ -203,10 +205,20 @@ struct dlm_args {
#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
#define DLM_IFL_OVERLAP_CANCEL 0x00100000
#define DLM_IFL_ENDOFLIFE 0x00200000
+#ifdef CONFIG_DLM_DEPRECATED_API
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
+#endif
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
+/* least significant 2 bytes are message changed, they are full transmitted
+ * but at receive side only the 2 bytes LSB will be set.
+ *
+ * Even wireshark dlm dissector does only evaluate the lower bytes and note
+ * that they may not be used on transceiver side, we assume the higher bytes
+ * are for internal use or reserved so long they are not parsed on receiver
+ * side.
+ */
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
@@ -249,10 +261,12 @@ struct dlm_lkb {
struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
struct list_head lkb_wait_reply; /* waiting for remote reply */
struct list_head lkb_ownqueue; /* list of locks for a process */
- struct list_head lkb_time_list;
ktime_t lkb_timestamp;
- ktime_t lkb_wait_time;
+
+#ifdef CONFIG_DLM_DEPRECATED_API
+ struct list_head lkb_time_list;
unsigned long lkb_timeout_cs;
+#endif
struct mutex lkb_cb_mutex;
struct work_struct lkb_cb_work;
@@ -568,8 +582,10 @@ struct dlm_ls {
struct mutex ls_orphans_mutex;
struct list_head ls_orphans;
+#ifdef CONFIG_DLM_DEPRECATED_API
struct mutex ls_timeout_mutex;
struct list_head ls_timeout;
+#endif
spinlock_t ls_new_rsb_spin;
int ls_new_rsb_count;
@@ -606,8 +622,8 @@ struct dlm_ls {
wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
- struct completion ls_members_done;
- int ls_members_result;
+ struct completion ls_recovery_done;
+ int ls_recovery_result;
struct miscdevice ls_device;
@@ -688,7 +704,9 @@ struct dlm_ls {
#define LSFL_RCOM_READY 5
#define LSFL_RCOM_WAIT 6
#define LSFL_UEVENT_WAIT 7
+#ifdef CONFIG_DLM_DEPRECATED_API
#define LSFL_TIMEWARN 8
+#endif
#define LSFL_CB_DELAY 9
#define LSFL_NODIR 10
@@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
return test_bit(LSFL_NODIR, &ls->ls_flags);
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_netlink_init(void);
void dlm_netlink_exit(void);
void dlm_timeout_warn(struct dlm_lkb *lkb);
+#else
+static inline int dlm_netlink_init(void) { return 0; }
+static inline void dlm_netlink_exit(void) { };
+static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { };
+#endif
int dlm_plock_init(void);
void dlm_plock_exit(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 226822f49d30..dac7eb75dba9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+#ifdef CONFIG_DLM_DEPRECATED_API
/* if the operation was a cancel, then return -DLM_ECANCEL, if a
timeout caused the cancel then return -ETIMEDOUT */
if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
rv = -ETIMEDOUT;
}
+#endif
if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
@@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
kref_init(&lkb->lkb_ref);
INIT_LIST_HEAD(&lkb->lkb_ownqueue);
INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
+#ifdef CONFIG_DLM_DEPRECATED_API
INIT_LIST_HEAD(&lkb->lkb_time_list);
+#endif
INIT_LIST_HEAD(&lkb->lkb_cb_list);
mutex_init(&lkb->lkb_cb_mutex);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
@@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb)
kref_get(&lkb->lkb_ref);
}
+static void unhold_lkb_assert(struct kref *kref)
+{
+ struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+ DLM_ASSERT(false, dlm_print_lkb(lkb););
+}
+
/* This is called when we need to remove a reference and are certain
it's not the last ref. e.g. del_lkb is always called between a
find_lkb/put_lkb and is always the inverse of a previous add_lkb.
@@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb)
static inline void unhold_lkb(struct dlm_lkb *lkb)
{
- int rv;
- rv = kref_put(&lkb->lkb_ref, kill_lkb);
- DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+ kref_put(&lkb->lkb_ref, unhold_lkb_assert);
}
static void lkb_add_ordered(struct list_head *new, struct list_head *head,
@@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype)
return -1;
}
-static int nodeid_warned(int nodeid, int num_nodes, int *warned)
-{
- int i;
-
- for (i = 0; i < num_nodes; i++) {
- if (!warned[i]) {
- warned[i] = nodeid;
- return 0;
- }
- if (warned[i] == nodeid)
- return 1;
- }
- return 0;
-}
-
-void dlm_scan_waiters(struct dlm_ls *ls)
-{
- struct dlm_lkb *lkb;
- s64 us;
- s64 debug_maxus = 0;
- u32 debug_scanned = 0;
- u32 debug_expired = 0;
- int num_nodes = 0;
- int *warned = NULL;
-
- if (!dlm_config.ci_waitwarn_us)
- return;
-
- mutex_lock(&ls->ls_waiters_mutex);
-
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (!lkb->lkb_wait_time)
- continue;
-
- debug_scanned++;
-
- us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
-
- if (us < dlm_config.ci_waitwarn_us)
- continue;
-
- lkb->lkb_wait_time = 0;
-
- debug_expired++;
- if (us > debug_maxus)
- debug_maxus = us;
-
- if (!num_nodes) {
- num_nodes = ls->ls_num_nodes;
- warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
- }
- if (!warned)
- continue;
- if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
- continue;
-
- log_error(ls, "waitwarn %x %lld %d us check connection to "
- "node %d", lkb->lkb_id, (long long)us,
- dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
- }
- mutex_unlock(&ls->ls_waiters_mutex);
- kfree(warned);
-
- if (debug_expired)
- log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
- debug_scanned, debug_expired,
- dlm_config.ci_waitwarn_us, (long long)debug_maxus);
-}
-
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
@@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
- lkb->lkb_wait_time = ktime_get();
lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
@@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls)
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
static void add_timeout(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
@@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
mutex_unlock(&ls->ls_timeout_mutex);
-
- if (!dlm_config.ci_waitwarn_us)
- return;
-
- mutex_lock(&ls->ls_waiters_mutex);
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (ktime_to_us(lkb->lkb_wait_time))
- lkb->lkb_wait_time = ktime_get();
- }
- mutex_unlock(&ls->ls_waiters_mutex);
}
+#else
+static void add_timeout(struct dlm_lkb *lkb) { }
+static void del_timeout(struct dlm_lkb *lkb) { }
+#endif
/* lkb is master or local copy */
@@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error)
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
int namelen, unsigned long timeout_cs,
void (*ast) (void *astparam),
void *astparam,
void (*bast) (void *astparam, int mode),
struct dlm_args *args)
+#else
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+ int namelen, void (*ast)(void *astparam),
+ void *astparam,
+ void (*bast)(void *astparam, int mode),
+ struct dlm_args *args)
+#endif
{
int rv = -EINVAL;
@@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
args->astfn = ast;
args->astparam = astparam;
args->bastfn = bast;
+#ifdef CONFIG_DLM_DEPRECATED_API
args->timeout = timeout_cs;
+#endif
args->mode = mode;
args->lksb = lksb;
rv = 0;
@@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
lkb->lkb_lksb = args->lksb;
lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
lkb->lkb_ownpid = (int) current->pid;
+#ifdef CONFIG_DLM_DEPRECATED_API
lkb->lkb_timeout_cs = args->timeout;
+#endif
rv = 0;
out:
if (rv)
@@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error)
goto out;
- trace_dlm_lock_start(ls, lkb, mode, flags);
+ trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
astarg, bast, &args);
+#else
+ error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast,
+ &args);
+#endif
if (error)
goto out_put;
@@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error == -EINPROGRESS)
error = 0;
out_put:
- trace_dlm_lock_end(ls, lkb, mode, flags, error);
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error);
if (convert || error)
__put_lkb(ls, lkb);
@@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
return 0;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
int mode, uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs)
+#else
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+ int mode, uint32_t flags, void *name, unsigned int namelen)
+#endif
{
struct dlm_lkb *lkb;
struct dlm_args args;
@@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
goto out;
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
+#else
+ error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua,
+ fake_bastfn, &args);
+#endif
if (error) {
kfree(ua->lksb.sb_lvbptr);
ua->lksb.sb_lvbptr = NULL;
@@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
return error;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs)
+#else
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+#endif
{
struct dlm_lkb *lkb;
struct dlm_args args;
@@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
ua->bastaddr = ua_tmp->bastaddr;
ua->user_lksb = ua_tmp->user_lksb;
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
+#else
+ error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua,
+ fake_bastfn, &args);
+#endif
if (error)
goto out_put;
@@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
- unsigned long timeout_cs, uint32_t *lkid)
+ uint32_t *lkid)
{
struct dlm_lkb *lkb = NULL, *iter;
struct dlm_user_args *ua;
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 252a5898f908..a7b6474f009d 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
-void dlm_scan_waiters(struct dlm_ls *ls);
+
+#ifdef CONFIG_DLM_DEPRECATED_API
void dlm_scan_timeout(struct dlm_ls *ls);
void dlm_adjust_timeouts(struct dlm_ls *ls);
+#else
+static inline void dlm_scan_timeout(struct dlm_ls *ls) { }
+static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { }
+#endif
+
int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
unsigned int flags, int *r_nodeid, int *result);
@@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls);
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs);
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs);
+#else
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+ uint32_t flags, void *name, unsigned int namelen);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+#endif
int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
- unsigned long timeout_cs, uint32_t *lkid);
+ uint32_t *lkid);
int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
uint32_t flags, uint32_t lkid, char *lvb_in);
int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 19ed41a5da93..6e449abdc5f4 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -275,7 +275,6 @@ static int dlm_scand(void *data)
ls->ls_scan_time = jiffies;
dlm_scan_rsbs(ls);
dlm_scan_timeout(ls);
- dlm_scan_waiters(ls);
dlm_unlock_recovery(ls);
} else {
ls->ls_scan_time += HZ;
@@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster,
ls->ls_ops_arg = ops_arg;
}
- if (flags & DLM_LSFL_TIMEWARN)
+#ifdef CONFIG_DLM_DEPRECATED_API
+ if (flags & DLM_LSFL_TIMEWARN) {
+ pr_warn_once("===============================================================\n"
+ "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n"
+ " will be removed in v5.22!\n"
+ " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n"
+ "===============================================================\n");
+
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
+ }
/* ls_exflags are forced to match among nodes, and we don't
- need to require all nodes to have some flags set */
+ * need to require all nodes to have some flags set
+ */
ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
DLM_LSFL_NEWEXCL));
+#else
+ /* ls_exflags are forced to match among nodes, and we don't
+ * need to require all nodes to have some flags set
+ */
+ ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL));
+#endif
size = READ_ONCE(dlm_config.ci_rsbtbl_size);
ls->ls_rsbtbl_size = size;
@@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_waiters_mutex);
INIT_LIST_HEAD(&ls->ls_orphans);
mutex_init(&ls->ls_orphans_mutex);
+#ifdef CONFIG_DLM_DEPRECATED_API
INIT_LIST_HEAD(&ls->ls_timeout);
mutex_init(&ls->ls_timeout_mutex);
+#endif
INIT_LIST_HEAD(&ls->ls_new_rsb);
spin_lock_init(&ls->ls_new_rsb_spin);
@@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster,
init_waitqueue_head(&ls->ls_uevent_wait);
ls->ls_uevent_result = 0;
- init_completion(&ls->ls_members_done);
- ls->ls_members_result = -1;
+ init_completion(&ls->ls_recovery_done);
+ ls->ls_recovery_result = -1;
mutex_init(&ls->ls_cb_mutex);
INIT_LIST_HEAD(&ls->ls_cb_delay);
@@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster,
if (error)
goto out_recoverd;
- wait_for_completion(&ls->ls_members_done);
- error = ls->ls_members_result;
+ /* wait until recovery is successful or failed */
+ wait_for_completion(&ls->ls_recovery_done);
+ error = ls->ls_recovery_result;
if (error)
goto out_members;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 19e82f08c0e0..a4e84e8d94c8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk)
return;
if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
- log_print("successful connected to node %d", con->nodeid);
+ log_print("connected to node %d", con->nodeid);
queue_work(send_workqueue, &con->swork);
return;
}
@@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
return ret;
if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("successful connected to node %d", con->nodeid);
+ log_print("connected to node %d", con->nodeid);
return 0;
}
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 98084e0cfccf..2af2ccfe43a9 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
int i, error, neg = 0, low = -1;
/* previously removed members that we've not finished removing need to
- count as a negative change so the "neg" recovery steps will happen */
+ * count as a negative change so the "neg" recovery steps will happen
+ *
+ * This functionality must report all member changes to lsops or
+ * midcomms layer and must never return before.
+ */
list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
log_rinfo(ls, "prev removed member %d", memb->nodeid);
@@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
*neg_out = neg;
error = ping_members(ls);
- /* error -EINTR means that a new recovery action is triggered.
- * We ignore this recovery action and let run the new one which might
- * have new member configuration.
- */
- if (error == -EINTR)
- error = 0;
-
- /* new_lockspace() may be waiting to know if the config
- * is good or bad
- */
- ls->ls_members_result = error;
- complete(&ls->ls_members_done);
-
log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
@@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
if (!ls->ls_recover_begin)
ls->ls_recover_begin = jiffies;
- dlm_lsop_recover_prep(ls);
+ /* call recover_prep ops only once and not multiple times
+ * for each possible dlm_ls_stop() when recovery is already
+ * stopped.
+ *
+ * If we successful was able to clear LSFL_RUNNING bit and
+ * it was set we know it is the first dlm_ls_stop() call.
+ */
+ if (new)
+ dlm_lsop_recover_prep(ls);
+
return 0;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 0993eebf2060..737f185aad8d 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -29,6 +29,8 @@ struct plock_async_data {
struct plock_op {
struct list_head list;
int done;
+ /* if lock op got interrupted while waiting dlm_controld reply */
+ bool sigint;
struct dlm_plock_info info;
/* if set indicates async handling */
struct plock_async_data *data;
@@ -79,8 +81,7 @@ static void send_op(struct plock_op *op)
abandoned waiter. So, we have to insert the unlock-close when the
lock call is interrupted. */
-static void do_unlock_close(struct dlm_ls *ls, u64 number,
- struct file *file, struct file_lock *fl)
+static void do_unlock_close(const struct dlm_plock_info *info)
{
struct plock_op *op;
@@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
return;
op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = fl->fl_pid;
- op->info.fsid = ls->ls_global_id;
- op->info.number = number;
+ op->info.pid = info->pid;
+ op->info.fsid = info->fsid;
+ op->info.number = info->number;
op->info.start = 0;
op->info.end = OFFSET_MAX;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = info->owner;
op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
@@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
spin_lock(&ops_lock);
- list_del(&op->list);
+ /* recheck under ops_lock if we got a done != 0,
+ * if so this interrupt case should be ignored
+ */
+ if (op->done != 0) {
+ spin_unlock(&ops_lock);
+ goto do_lock_wait;
+ }
+
+ op->sigint = true;
spin_unlock(&ops_lock);
- log_print("%s: wait interrupted %x %llx, op removed",
+ log_debug(ls, "%s: wait interrupted %x %llx pid %d",
__func__, ls->ls_global_id,
- (unsigned long long)number);
- dlm_release_plock_op(op);
- do_unlock_close(ls, number, file, fl);
+ (unsigned long long)number, op->info.pid);
goto out;
}
+do_lock_wait:
+
WARN_ON(!list_empty(&op->list));
rv = op->info.rv;
@@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
spin_lock(&ops_lock);
if (!list_empty(&send_list)) {
- op = list_entry(send_list.next, struct plock_op, list);
+ op = list_first_entry(&send_list, struct plock_op, list);
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
list_del(&op->list);
else
@@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
if (iter->info.fsid == info.fsid &&
iter->info.number == info.number &&
iter->info.owner == info.owner) {
+ if (iter->sigint) {
+ list_del(&iter->list);
+ spin_unlock(&ops_lock);
+
+ pr_debug("%s: sigint cleanup %x %llx pid %d",
+ __func__, iter->info.fsid,
+ (unsigned long long)iter->info.number,
+ iter->info.pid);
+ do_unlock_close(&iter->info);
+ memcpy(&iter->info, &info, sizeof(info));
+ dlm_release_plock_op(iter);
+ return count;
+ }
list_del_init(&iter->list);
memcpy(&iter->info, &info, sizeof(info));
if (iter->data)
@@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
else
wake_up(&recv_wq);
} else
- log_print("%s: no op %x %llx - may got interrupted?", __func__,
+ log_print("%s: no op %x %llx", __func__,
info.fsid, (unsigned long long)info.number);
return count;
}
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index a55dfce705dd..e15eb511b04b 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
/*
* Add or remove nodes from the lockspace's ls_nodes list.
+ *
+ * Due to the fact that we must report all membership changes to lsops
+ * or midcomms layer, it is not permitted to abort ls_recover() until
+ * this is done.
*/
error = dlm_recover_members(ls, rv, &neg);
@@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
- dlm_lsop_recover_done(ls);
return 0;
fail:
dlm_release_root_list(ls);
- log_rinfo(ls, "dlm_recover %llu error %d",
- (unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
+
return error;
}
@@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
static void do_ls_recovery(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL;
+ int error;
spin_lock(&ls->ls_recover_lock);
rv = ls->ls_recover_args;
@@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls)
spin_unlock(&ls->ls_recover_lock);
if (rv) {
- ls_recover(ls, rv);
+ error = ls_recover(ls, rv);
+ switch (error) {
+ case 0:
+ ls->ls_recovery_result = 0;
+ complete(&ls->ls_recovery_done);
+
+ dlm_lsop_recover_done(ls);
+ break;
+ case -EINTR:
+ /* if recovery was interrupted -EINTR we wait for the next
+ * ls_recover() iteration until it hopefully succeeds.
+ */
+ log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
+ __func__, (unsigned long long)rv->seq);
+ break;
+ default:
+ log_rinfo(ls, "%s %llu error %d", __func__,
+ (unsigned long long)rv->seq, error);
+
+ /* let new_lockspace() get aware of critical error */
+ ls->ls_recovery_result = error;
+ complete(&ls->ls_recovery_done);
+ break;
+ }
+
kfree(rv->nodes);
kfree(rv);
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 1060b24f18d4..999918348b31 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc,
goto out;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
+ if (params->timeout)
+ pr_warn_once("========================================================\n"
+ "WARNING: the lkb timeout feature is being deprecated and\n"
+ " will be removed in v5.22!\n"
+ "========================================================\n");
+#endif
+
ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
if (!ua)
goto out;
@@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc,
ua->xid = params->xid;
if (params->flags & DLM_LKF_CONVERT) {
+#ifdef CONFIG_DLM_DEPRECATED_API
error = dlm_user_convert(ls, ua,
params->mode, params->flags,
params->lkid, params->lvb,
(unsigned long) params->timeout);
+#else
+ error = dlm_user_convert(ls, ua,
+ params->mode, params->flags,
+ params->lkid, params->lvb);
+#endif
} else if (params->flags & DLM_LKF_ORPHAN) {
error = dlm_user_adopt_orphan(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
- (unsigned long) params->timeout,
&lkid);
if (!error)
error = lkid;
} else {
+#ifdef CONFIG_DLM_DEPRECATED_API
error = dlm_user_request(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
(unsigned long) params->timeout);
+#else
+ error = dlm_user_request(ls, ua,
+ params->mode, params->flags,
+ params->name, params->namelen);
+#endif
if (!error)
error = ua->lksb.sb_lkid;
}
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile
index 0b1c5e63eb71..7bfc2f9754a8 100644
--- a/fs/efivarfs/Makefile
+++ b/fs/efivarfs/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_EFIVAR_FS) += efivarfs.o
-efivarfs-objs := inode.o file.o super.o
+efivarfs-objs := inode.o file.o super.o vars.o
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 30ae44cb7453..8ebf3a6a8aa2 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -7,6 +7,46 @@
#define EFIVAR_FS_INTERNAL_H
#include <linux/list.h>
+#include <linux/efi.h>
+
+struct efi_variable {
+ efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
+ efi_guid_t VendorGuid;
+ unsigned long DataSize;
+ __u8 Data[1024];
+ efi_status_t Status;
+ __u32 Attributes;
+} __attribute__((packed));
+
+struct efivar_entry {
+ struct efi_variable var;
+ struct list_head list;
+ struct kobject kobj;
+};
+
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+ void *data, bool duplicates, struct list_head *head);
+
+int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void efivar_entry_remove(struct efivar_entry *entry);
+int efivar_entry_delete(struct efivar_entry *entry);
+
+int efivar_entry_size(struct efivar_entry *entry, unsigned long *size);
+int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data);
+int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data);
+int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
+ unsigned long *size, void *data, bool *set);
+
+int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
+ struct list_head *head, void *data);
+
+bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+ unsigned long data_size);
+bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
+ size_t len);
extern const struct file_operations efivarfs_file_operations;
extern const struct inode_operations efivarfs_dir_inode_operations;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 15880a68faad..6780fc81cc11 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -155,10 +155,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
goto fail_inode;
}
- efivar_entry_size(entry, &size);
- err = efivar_entry_add(entry, &efivarfs_list);
- if (err)
- goto fail_inode;
+ __efivar_entry_get(entry, NULL, &size, NULL);
+ __efivar_entry_add(entry, &efivarfs_list);
/* copied by the above to local storage in the dentry. */
kfree(name);
@@ -182,10 +180,7 @@ fail:
static int efivarfs_destroy(struct efivar_entry *entry, void *data)
{
- int err = efivar_entry_remove(entry);
-
- if (err)
- return err;
+ efivar_entry_remove(entry);
kfree(entry);
return 0;
}
@@ -221,7 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
if (err)
- __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
return err;
}
@@ -246,7 +241,7 @@ static void efivarfs_kill_sb(struct super_block *sb)
kill_litter_super(sb);
/* Remove all entries and destroy */
- __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
}
static struct file_system_type efivarfs_type = {
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
new file mode 100644
index 000000000000..a0ef63cfcecb
--- /dev/null
+++ b/fs/efivarfs/vars.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Originally from efivars.c
+ *
+ * Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com>
+ * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
+ */
+
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/efi.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/ucs2_string.h>
+
+#include "internal.h"
+
+MODULE_IMPORT_NS(EFIVAR);
+
+static bool
+validate_device_path(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ struct efi_generic_dev_path *node;
+ int offset = 0;
+
+ node = (struct efi_generic_dev_path *)buffer;
+
+ if (len < sizeof(*node))
+ return false;
+
+ while (offset <= len - sizeof(*node) &&
+ node->length >= sizeof(*node) &&
+ node->length <= len - offset) {
+ offset += node->length;
+
+ if ((node->type == EFI_DEV_END_PATH ||
+ node->type == EFI_DEV_END_PATH2) &&
+ node->sub_type == EFI_DEV_END_ENTIRE)
+ return true;
+
+ node = (struct efi_generic_dev_path *)(buffer + offset);
+ }
+
+ /*
+ * If we're here then either node->length pointed past the end
+ * of the buffer or we reached the end of the buffer without
+ * finding a device path end node.
+ */
+ return false;
+}
+
+static bool
+validate_boot_order(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ /* An array of 16-bit integers */
+ if ((len % 2) != 0)
+ return false;
+
+ return true;
+}
+
+static bool
+validate_load_option(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ u16 filepathlength;
+ int i, desclength = 0, namelen;
+
+ namelen = ucs2_strnlen(var_name, EFI_VAR_NAME_LEN);
+
+ /* Either "Boot" or "Driver" followed by four digits of hex */
+ for (i = match; i < match+4; i++) {
+ if (var_name[i] > 127 ||
+ hex_to_bin(var_name[i] & 0xff) < 0)
+ return true;
+ }
+
+ /* Reject it if there's 4 digits of hex and then further content */
+ if (namelen > match + 4)
+ return false;
+
+ /* A valid entry must be at least 8 bytes */
+ if (len < 8)
+ return false;
+
+ filepathlength = buffer[4] | buffer[5] << 8;
+
+ /*
+ * There's no stored length for the description, so it has to be
+ * found by hand
+ */
+ desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2;
+
+ /* Each boot entry must have a descriptor */
+ if (!desclength)
+ return false;
+
+ /*
+ * If the sum of the length of the description, the claimed filepath
+ * length and the original header are greater than the length of the
+ * variable, it's malformed
+ */
+ if ((desclength + filepathlength + 6) > len)
+ return false;
+
+ /*
+ * And, finally, check the filepath
+ */
+ return validate_device_path(var_name, match, buffer + desclength + 6,
+ filepathlength);
+}
+
+static bool
+validate_uint16(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ /* A single 16-bit integer */
+ if (len != 2)
+ return false;
+
+ return true;
+}
+
+static bool
+validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (buffer[i] > 127)
+ return false;
+
+ if (buffer[i] == 0)
+ return true;
+ }
+
+ return false;
+}
+
+struct variable_validate {
+ efi_guid_t vendor;
+ char *name;
+ bool (*validate)(efi_char16_t *var_name, int match, u8 *data,
+ unsigned long len);
+};
+
+/*
+ * This is the list of variables we need to validate, as well as the
+ * whitelist for what we think is safe not to default to immutable.
+ *
+ * If it has a validate() method that's not NULL, it'll go into the
+ * validation routine. If not, it is assumed valid, but still used for
+ * whitelisting.
+ *
+ * Note that it's sorted by {vendor,name}, but globbed names must come after
+ * any other name with the same prefix.
+ */
+static const struct variable_validate variable_validate[] = {
+ { EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 },
+ { EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order },
+ { EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option },
+ { EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order },
+ { EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string },
+ { EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL },
+ { EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string },
+ { EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 },
+ { LINUX_EFI_CRASH_GUID, "*", NULL },
+ { NULL_GUID, "", NULL },
+};
+
+/*
+ * Check if @var_name matches the pattern given in @match_name.
+ *
+ * @var_name: an array of @len non-NUL characters.
+ * @match_name: a NUL-terminated pattern string, optionally ending in "*". A
+ * final "*" character matches any trailing characters @var_name,
+ * including the case when there are none left in @var_name.
+ * @match: on output, the number of non-wildcard characters in @match_name
+ * that @var_name matches, regardless of the return value.
+ * @return: whether @var_name fully matches @match_name.
+ */
+static bool
+variable_matches(const char *var_name, size_t len, const char *match_name,
+ int *match)
+{
+ for (*match = 0; ; (*match)++) {
+ char c = match_name[*match];
+
+ switch (c) {
+ case '*':
+ /* Wildcard in @match_name means we've matched. */
+ return true;
+
+ case '\0':
+ /* @match_name has ended. Has @var_name too? */
+ return (*match == len);
+
+ default:
+ /*
+ * We've reached a non-wildcard char in @match_name.
+ * Continue only if there's an identical character in
+ * @var_name.
+ */
+ if (*match < len && c == var_name[*match])
+ continue;
+ return false;
+ }
+ }
+}
+
+bool
+efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+ unsigned long data_size)
+{
+ int i;
+ unsigned long utf8_size;
+ u8 *utf8_name;
+
+ utf8_size = ucs2_utf8size(var_name);
+ utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL);
+ if (!utf8_name)
+ return false;
+
+ ucs2_as_utf8(utf8_name, var_name, utf8_size);
+ utf8_name[utf8_size] = '\0';
+
+ for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+ const char *name = variable_validate[i].name;
+ int match = 0;
+
+ if (efi_guidcmp(vendor, variable_validate[i].vendor))
+ continue;
+
+ if (variable_matches(utf8_name, utf8_size+1, name, &match)) {
+ if (variable_validate[i].validate == NULL)
+ break;
+ kfree(utf8_name);
+ return variable_validate[i].validate(var_name, match,
+ data, data_size);
+ }
+ }
+ kfree(utf8_name);
+ return true;
+}
+
+bool
+efivar_variable_is_removable(efi_guid_t vendor, const char *var_name,
+ size_t len)
+{
+ int i;
+ bool found = false;
+ int match = 0;
+
+ /*
+ * Check if our variable is in the validated variables list
+ */
+ for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+ if (efi_guidcmp(variable_validate[i].vendor, vendor))
+ continue;
+
+ if (variable_matches(var_name, len,
+ variable_validate[i].name, &match)) {
+ found = true;
+ break;
+ }
+ }
+
+ /*
+ * If it's in our list, it is removable.
+ */
+ return found;
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
+ struct list_head *head)
+{
+ struct efivar_entry *entry, *n;
+ unsigned long strsize1, strsize2;
+ bool found = false;
+
+ strsize1 = ucs2_strsize(variable_name, 1024);
+ list_for_each_entry_safe(entry, n, head, list) {
+ strsize2 = ucs2_strsize(entry->var.VariableName, 1024);
+ if (strsize1 == strsize2 &&
+ !memcmp(variable_name, &(entry->var.VariableName),
+ strsize2) &&
+ !efi_guidcmp(entry->var.VendorGuid,
+ *vendor)) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+/*
+ * Returns the size of variable_name, in bytes, including the
+ * terminating NULL character, or variable_name_size if no NULL
+ * character is found among the first variable_name_size bytes.
+ */
+static unsigned long var_name_strnsize(efi_char16_t *variable_name,
+ unsigned long variable_name_size)
+{
+ unsigned long len;
+ efi_char16_t c;
+
+ /*
+ * The variable name is, by definition, a NULL-terminated
+ * string, so make absolutely sure that variable_name_size is
+ * the value we expect it to be. If not, return the real size.
+ */
+ for (len = 2; len <= variable_name_size; len += sizeof(c)) {
+ c = variable_name[(len / sizeof(c)) - 1];
+ if (!c)
+ break;
+ }
+
+ return min(len, variable_name_size);
+}
+
+/*
+ * Print a warning when duplicate EFI variables are encountered and
+ * disable the sysfs workqueue since the firmware is buggy.
+ */
+static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
+ unsigned long len16)
+{
+ size_t i, len8 = len16 / sizeof(efi_char16_t);
+ char *str8;
+
+ str8 = kzalloc(len8, GFP_KERNEL);
+ if (!str8)
+ return;
+
+ for (i = 0; i < len8; i++)
+ str8[i] = str16[i];
+
+ printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n",
+ str8, vendor_guid);
+ kfree(str8);
+}
+
+/**
+ * efivar_init - build the initial list of EFI variables
+ * @func: callback function to invoke for every variable
+ * @data: function-specific data to pass to @func
+ * @duplicates: error if we encounter duplicates on @head?
+ * @head: initialised head of variable list
+ *
+ * Get every EFI variable from the firmware and invoke @func. @func
+ * should call efivar_entry_add() to build the list of variables.
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+ void *data, bool duplicates, struct list_head *head)
+{
+ unsigned long variable_name_size = 1024;
+ efi_char16_t *variable_name;
+ efi_status_t status;
+ efi_guid_t vendor_guid;
+ int err = 0;
+
+ variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+ if (!variable_name) {
+ printk(KERN_ERR "efivars: Memory allocation failed.\n");
+ return -ENOMEM;
+ }
+
+ err = efivar_lock();
+ if (err)
+ goto free;
+
+ /*
+ * Per EFI spec, the maximum storage allocated for both
+ * the variable name and variable data is 1024 bytes.
+ */
+
+ do {
+ variable_name_size = 1024;
+
+ status = efivar_get_next_variable(&variable_name_size,
+ variable_name,
+ &vendor_guid);
+ switch (status) {
+ case EFI_SUCCESS:
+ variable_name_size = var_name_strnsize(variable_name,
+ variable_name_size);
+
+ /*
+ * Some firmware implementations return the
+ * same variable name on multiple calls to
+ * get_next_variable(). Terminate the loop
+ * immediately as there is no guarantee that
+ * we'll ever see a different variable name,
+ * and may end up looping here forever.
+ */
+ if (duplicates &&
+ variable_is_present(variable_name, &vendor_guid,
+ head)) {
+ dup_variable_bug(variable_name, &vendor_guid,
+ variable_name_size);
+ status = EFI_NOT_FOUND;
+ } else {
+ err = func(variable_name, vendor_guid,
+ variable_name_size, data);
+ if (err)
+ status = EFI_NOT_FOUND;
+ }
+ break;
+ case EFI_UNSUPPORTED:
+ err = -EOPNOTSUPP;
+ status = EFI_NOT_FOUND;
+ break;
+ case EFI_NOT_FOUND:
+ break;
+ default:
+ printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
+ status);
+ status = EFI_NOT_FOUND;
+ break;
+ }
+
+ } while (status != EFI_NOT_FOUND);
+
+ efivar_unlock();
+free:
+ kfree(variable_name);
+
+ return err;
+}
+
+/**
+ * efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+int efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+ int err;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+ list_add(&entry->list, head);
+ efivar_unlock();
+
+ return 0;
+}
+
+/**
+ * __efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ */
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+ list_add(&entry->list, head);
+}
+
+/**
+ * efivar_entry_remove - remove entry from variable list
+ * @entry: entry to remove from list
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+void efivar_entry_remove(struct efivar_entry *entry)
+{
+ list_del(&entry->list);
+}
+
+/*
+ * efivar_entry_list_del_unlock - remove entry from variable list
+ * @entry: entry to remove
+ *
+ * Remove @entry from the variable list and release the list lock.
+ *
+ * NOTE: slightly weird locking semantics here - we expect to be
+ * called with the efivars lock already held, and we release it before
+ * returning. This is because this function is usually called after
+ * set_variable() while the lock is still held.
+ */
+static void efivar_entry_list_del_unlock(struct efivar_entry *entry)
+{
+ list_del(&entry->list);
+ efivar_unlock();
+}
+
+/**
+ * efivar_entry_delete - delete variable and remove entry from list
+ * @entry: entry containing variable to delete
+ *
+ * Delete the variable from the firmware and remove @entry from the
+ * variable list. It is the caller's responsibility to free @entry
+ * once we return.
+ *
+ * Returns 0 on success, -EINTR if we can't grab the semaphore,
+ * converted EFI status code if set_variable() fails.
+ */
+int efivar_entry_delete(struct efivar_entry *entry)
+{
+ efi_status_t status;
+ int err;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ status = efivar_set_variable_locked(entry->var.VariableName,
+ &entry->var.VendorGuid,
+ 0, 0, NULL, false);
+ if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) {
+ efivar_unlock();
+ return efi_status_to_err(status);
+ }
+
+ efivar_entry_list_del_unlock(entry);
+ return 0;
+}
+
+/**
+ * efivar_entry_size - obtain the size of a variable
+ * @entry: entry for this variable
+ * @size: location to store the variable's size
+ */
+int efivar_entry_size(struct efivar_entry *entry, unsigned long *size)
+{
+ efi_status_t status;
+ int err;
+
+ *size = 0;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ status = efivar_get_variable(entry->var.VariableName,
+ &entry->var.VendorGuid, NULL, size, NULL);
+ efivar_unlock();
+
+ if (status != EFI_BUFFER_TOO_SMALL)
+ return efi_status_to_err(status);
+
+ return 0;
+}
+
+/**
+ * __efivar_entry_get - call get_variable()
+ * @entry: read data for this variable
+ * @attributes: variable attributes
+ * @size: size of @data buffer
+ * @data: buffer to store variable data
+ *
+ * The caller MUST call efivar_entry_iter_begin() and
+ * efivar_entry_iter_end() before and after the invocation of this
+ * function, respectively.
+ */
+int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data)
+{
+ efi_status_t status;
+
+ status = efivar_get_variable(entry->var.VariableName,
+ &entry->var.VendorGuid,
+ attributes, size, data);
+
+ return efi_status_to_err(status);
+}
+
+/**
+ * efivar_entry_get - call get_variable()
+ * @entry: read data for this variable
+ * @attributes: variable attributes
+ * @size: size of @data buffer
+ * @data: buffer to store variable data
+ */
+int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data)
+{
+ int err;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+ err = __efivar_entry_get(entry, attributes, size, data);
+ efivar_unlock();
+
+ return 0;
+}
+
+/**
+ * efivar_entry_set_get_size - call set_variable() and get new size (atomic)
+ * @entry: entry containing variable to set and get
+ * @attributes: attributes of variable to be written
+ * @size: size of data buffer
+ * @data: buffer containing data to write
+ * @set: did the set_variable() call succeed?
+ *
+ * This is a pretty special (complex) function. See efivarfs_file_write().
+ *
+ * Atomically call set_variable() for @entry and if the call is
+ * successful, return the new size of the variable from get_variable()
+ * in @size. The success of set_variable() is indicated by @set.
+ *
+ * Returns 0 on success, -EINVAL if the variable data is invalid,
+ * -ENOSPC if the firmware does not have enough available space, or a
+ * converted EFI status code if either of set_variable() or
+ * get_variable() fail.
+ *
+ * If the EFI variable does not exist when calling set_variable()
+ * (EFI_NOT_FOUND), @entry is removed from the variable list.
+ */
+int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
+ unsigned long *size, void *data, bool *set)
+{
+ efi_char16_t *name = entry->var.VariableName;
+ efi_guid_t *vendor = &entry->var.VendorGuid;
+ efi_status_t status;
+ int err;
+
+ *set = false;
+
+ if (efivar_validate(*vendor, name, data, *size) == false)
+ return -EINVAL;
+
+ /*
+ * The lock here protects the get_variable call, the conditional
+ * set_variable call, and removal of the variable from the efivars
+ * list (in the case of an authenticated delete).
+ */
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ /*
+ * Ensure that the available space hasn't shrunk below the safe level
+ */
+ status = check_var_size(attributes, *size + ucs2_strsize(name, 1024));
+ if (status != EFI_SUCCESS) {
+ if (status != EFI_UNSUPPORTED) {
+ err = efi_status_to_err(status);
+ goto out;
+ }
+
+ if (*size > 65536) {
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ status = efivar_set_variable_locked(name, vendor, attributes, *size,
+ data, false);
+ if (status != EFI_SUCCESS) {
+ err = efi_status_to_err(status);
+ goto out;
+ }
+
+ *set = true;
+
+ /*
+ * Writing to the variable may have caused a change in size (which
+ * could either be an append or an overwrite), or the variable to be
+ * deleted. Perform a GetVariable() so we can tell what actually
+ * happened.
+ */
+ *size = 0;
+ status = efivar_get_variable(entry->var.VariableName,
+ &entry->var.VendorGuid,
+ NULL, size, NULL);
+
+ if (status == EFI_NOT_FOUND)
+ efivar_entry_list_del_unlock(entry);
+ else
+ efivar_unlock();
+
+ if (status && status != EFI_BUFFER_TOO_SMALL)
+ return efi_status_to_err(status);
+
+ return 0;
+
+out:
+ efivar_unlock();
+ return err;
+
+}
+
+/**
+ * efivar_entry_iter - iterate over variable list
+ * @func: callback function
+ * @head: head of variable list
+ * @data: function-specific data to pass to callback
+ *
+ * Iterate over the list of EFI variables and call @func with every
+ * entry on the list. It is safe for @func to remove entries in the
+ * list via efivar_entry_delete() while iterating.
+ *
+ * Some notes for the callback function:
+ * - a non-zero return value indicates an error and terminates the loop
+ * - @func is called from atomic context
+ */
+int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
+ struct list_head *head, void *data)
+{
+ struct efivar_entry *entry, *n;
+ int err = 0;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ list_for_each_entry_safe(entry, n, head, list) {
+ err = func(entry, data);
+ if (err)
+ break;
+ }
+ efivar_unlock();
+
+ return err;
+}
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 05a3063cf2bc..5e59b3f523eb 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -143,6 +143,7 @@ again:
DBG_BUGON(z_erofs_lzma_head);
z_erofs_lzma_head = head;
spin_unlock(&z_erofs_lzma_lock);
+ wake_up_all(&z_erofs_lzma_wq);
z_erofs_lzma_max_dictsize = dict_size;
mutex_unlock(&lzma_resize_mutex);
diff --git a/fs/exec.c b/fs/exec.c
index 0989fb8472a1..5ad26d9536c8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,6 +65,7 @@
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
+#include <linux/time_namespace.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -982,10 +983,12 @@ static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
+ bool vfork;
int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
+ vfork = !!tsk->vfork_done;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
if (old_mm)
@@ -1030,6 +1033,10 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
+
+ if (vfork)
+ timens_on_fork(tsk->nsproxy, tsk);
+
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
@@ -1882,7 +1889,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 43de293cef56..8f597753ac12 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -200,19 +200,19 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n,
int quiet, void **page_addr)
{
struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page)) {
- *page_addr = kmap_local_page(page);
- if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !ext2_check_page(page, quiet,
- *page_addr))
- goto fail;
- }
+ struct folio *folio = read_mapping_folio(mapping, n, NULL);
+
+ if (IS_ERR(folio))
+ return &folio->page;
+ *page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
+ if (unlikely(!folio_test_checked(folio))) {
+ if (!ext2_check_page(&folio->page, quiet, *page_addr))
+ goto fail;
}
- return page;
+ return &folio->page;
fail:
- ext2_put_page(page, *page_addr);
+ ext2_put_page(&folio->page, *page_addr);
return ERR_PTR(-EIO);
}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index d4f306aa5ace..28de11a22e5f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -795,7 +795,6 @@ extern const struct file_operations ext2_file_operations;
/* inode.c */
extern void ext2_set_file_ops(struct inode *inode);
extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_nobh_aops;
extern const struct iomap_ops ext2_iomap_ops;
/* namei.c */
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e6b932219803..918ab2f9e4c0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -908,25 +908,6 @@ static int ext2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
-static int
-ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
-{
- int ret;
-
- ret = nobh_write_begin(mapping, pos, len, pagep, fsdata,
- ext2_get_block);
- if (ret < 0)
- ext2_write_failed(mapping, pos + len);
- return ret;
-}
-
-static int ext2_nobh_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- return nobh_writepage(page, ext2_get_block, wbc);
-}
-
static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,ext2_get_block);
@@ -973,26 +954,11 @@ const struct address_space_operations ext2_aops = {
.bmap = ext2_bmap,
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
-const struct address_space_operations ext2_nobh_aops = {
- .dirty_folio = block_dirty_folio,
- .invalidate_folio = block_invalidate_folio,
- .read_folio = ext2_read_folio,
- .readahead = ext2_readahead,
- .writepage = ext2_nobh_writepage,
- .write_begin = ext2_nobh_write_begin,
- .write_end = nobh_write_end,
- .bmap = ext2_bmap,
- .direct_IO = ext2_direct_IO,
- .writepages = ext2_writepages,
- .migratepage = buffer_migrate_page,
- .error_remove_page = generic_error_remove_page,
-};
-
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
@@ -1298,13 +1264,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
- if (IS_DAX(inode)) {
+ if (IS_DAX(inode))
error = dax_zero_range(inode, newsize,
PAGE_ALIGN(newsize) - newsize, NULL,
&ext2_iomap_ops);
- } else if (test_opt(inode->i_sb, NOBH))
- error = nobh_truncate_page(inode->i_mapping,
- newsize, ext2_get_block);
else
error = block_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
@@ -1396,8 +1359,6 @@ void ext2_set_file_ops(struct inode *inode)
inode->i_fop = &ext2_file_operations;
if (IS_DAX(inode))
inode->i_mapping->a_ops = &ext2_dax_aops;
- else if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
}
@@ -1497,10 +1458,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (ext2_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
@@ -1510,10 +1468,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
} else {
inode->i_op = &ext2_symlink_inode_operations;
inode_nohighmem(inode);
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
}
} else {
inode->i_op = &ext2_special_inode_operations;
@@ -1679,14 +1634,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
- if (is_quota_modification(inode, iattr)) {
+ if (is_quota_modification(mnt_userns, inode, iattr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
- (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
- error = dquot_transfer(inode, iattr);
+ if (i_uid_needs_update(mnt_userns, iattr, inode) ||
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
+ error = dquot_transfer(mnt_userns, inode, iattr);
if (error)
return error;
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 5f6b7560eb3f..5fd9a22d2b70 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -178,10 +178,7 @@ static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir,
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
inode_nohighmem(inode);
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
@@ -247,10 +244,7 @@ static int ext2_mkdir(struct user_namespace * mnt_userns,
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
inode_inc_link_count(inode);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..a1c1263c07ab 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -296,9 +296,6 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noacl");
#endif
- if (test_opt(sb, NOBH))
- seq_puts(seq, ",nobh");
-
if (test_opt(sb, USRQUOTA))
seq_puts(seq, ",usrquota");
@@ -551,7 +548,8 @@ static int parse_options(char *options, struct super_block *sb,
clear_opt (opts->s_mount_opt, OLDALLOC);
break;
case Opt_nobh:
- set_opt (opts->s_mount_opt, NOBH);
+ ext2_msg(sb, KERN_INFO,
+ "nobh option not supported");
break;
#ifdef CONFIG_EXT2_FS_XATTR
case Opt_user_xattr:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 84c0eb55071d..9fd60fc8ba4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1554,9 +1554,9 @@ struct mpage_da_data {
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
bool invalidate)
{
- int nr_pages, i;
+ unsigned nr, i;
pgoff_t index, end;
- struct pagevec pvec;
+ struct folio_batch fbatch;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
@@ -1574,15 +1574,18 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
ext4_es_remove_extent(inode, start, last - start + 1);
}
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (index <= end) {
- nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct folio *folio = page_folio(page);
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
+ if (folio->index < mpd->first_page)
+ continue;
+ if (folio->index + folio_nr_pages(folio) - 1 > end)
+ continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
if (invalidate) {
@@ -1594,7 +1597,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
}
folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
}
@@ -2311,8 +2314,8 @@ out:
*/
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
- struct pagevec pvec;
- int nr_pages, i;
+ struct folio_batch fbatch;
+ unsigned nr, i;
struct inode *inode = mpd->inode;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
@@ -2326,14 +2329,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (start <= end) {
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
- &start, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct page *page = &fbatch.folios[i]->page;
err = mpage_process_page(mpd, page, &lblk, &pblock,
&map_bh);
@@ -2349,14 +2351,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
if (err < 0)
goto out;
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
/* Extent fully mapped and matches with page boundary. We are done. */
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
out:
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
return err;
}
@@ -3631,7 +3633,7 @@ static const struct address_space_operations ext4_aops = {
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
@@ -3666,7 +3668,7 @@ static const struct address_space_operations ext4_da_aops = {
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
@@ -5350,14 +5352,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
@@ -5374,7 +5376,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* counts xattr inode references.
*/
down_read(&EXT4_I(inode)->xattr_sem);
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
up_read(&EXT4_I(inode)->xattr_sem);
if (error) {
@@ -5383,10 +5385,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Update corresponding info in inode so that everything is in
* one transaction */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
if (unlikely(error)) {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6d8b2bf14de0..8259e0fa97e1 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -463,9 +463,7 @@ const struct address_space_operations f2fs_meta_aops = {
.dirty_folio = f2fs_dirty_meta_folio,
.invalidate_folio = f2fs_invalidate_folio,
.release_folio = f2fs_release_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
-#endif
+ .migrate_folio = filemap_migrate_folio,
};
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 24824cd96f36..ef88bf7d00e7 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -729,28 +729,18 @@ out:
return ret;
}
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
+static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool end_io)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
- struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
- f2fs_cops[fi->i_compress_algorithm];
- int ret;
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
int i;
- trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
- dic->cluster_size, fi->i_compress_algorithm);
-
- if (dic->failed) {
- ret = -EIO;
- goto out_end_io;
- }
+ if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode)))
+ return 0;
dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
- if (!dic->tpages) {
- ret = -ENOMEM;
- goto out_end_io;
- }
+ if (!dic->tpages)
+ return 1;
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i]) {
@@ -759,28 +749,100 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
}
dic->tpages[i] = f2fs_compress_alloc_page();
- if (!dic->tpages[i]) {
- ret = -ENOMEM;
- goto out_end_io;
- }
+ if (!dic->tpages[i])
+ return 1;
}
+ dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
+ if (!dic->rbuf)
+ return 1;
+
+ dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
+ if (!dic->cbuf)
+ return 1;
+
+ cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
if (cops->init_decompress_ctx) {
- ret = cops->init_decompress_ctx(dic);
+ int ret = cops->init_decompress_ctx(dic);
+
if (ret)
- goto out_end_io;
+ return 1;
}
- dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
- if (!dic->rbuf) {
- ret = -ENOMEM;
- goto out_destroy_decompress_ctx;
+ return 0;
+}
+
+static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback, bool end_io)
+{
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+
+ if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode)))
+ return;
+
+ if (!bypass_destroy_callback && cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
+
+ if (dic->cbuf)
+ vm_unmap_ram(dic->cbuf, dic->nr_cpages);
+
+ if (dic->rbuf)
+ vm_unmap_ram(dic->rbuf, dic->cluster_size);
+}
+
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback)
+{
+ int i;
+
+ f2fs_release_decomp_mem(dic, bypass_destroy_callback, false);
+
+ if (dic->tpages) {
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->rpages[i])
+ continue;
+ if (!dic->tpages[i])
+ continue;
+ f2fs_compress_free_page(dic->tpages[i]);
+ }
+ page_array_free(dic->inode, dic->tpages, dic->cluster_size);
}
- dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
- if (!dic->cbuf) {
+ if (dic->cpages) {
+ for (i = 0; i < dic->nr_cpages; i++) {
+ if (!dic->cpages[i])
+ continue;
+ f2fs_compress_free_page(dic->cpages[i]);
+ }
+ page_array_free(dic->inode, dic->cpages, dic->nr_cpages);
+ }
+
+ page_array_free(dic->inode, dic->rpages, dic->nr_rpages);
+ kmem_cache_free(dic_entry_slab, dic);
+}
+
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+ struct f2fs_inode_info *fi = F2FS_I(dic->inode);
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[fi->i_compress_algorithm];
+ bool bypass_callback = false;
+ int ret;
+
+ trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
+ dic->cluster_size, fi->i_compress_algorithm);
+
+ if (dic->failed) {
+ ret = -EIO;
+ goto out_end_io;
+ }
+
+ if (f2fs_prepare_decomp_mem(dic, true)) {
+ bypass_callback = true;
ret = -ENOMEM;
- goto out_vunmap_rbuf;
+ goto out_release;
}
dic->clen = le32_to_cpu(dic->cbuf->clen);
@@ -788,7 +850,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
ret = -EFSCORRUPTED;
- goto out_vunmap_cbuf;
+ goto out_release;
}
ret = cops->decompress_pages(dic);
@@ -809,17 +871,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
}
}
-out_vunmap_cbuf:
- vm_unmap_ram(dic->cbuf, dic->nr_cpages);
-out_vunmap_rbuf:
- vm_unmap_ram(dic->rbuf, dic->cluster_size);
-out_destroy_decompress_ctx:
- if (cops->destroy_decompress_ctx)
- cops->destroy_decompress_ctx(dic);
+out_release:
+ f2fs_release_decomp_mem(dic, bypass_callback, true);
+
out_end_io:
trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
dic->clen, ret);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, in_task);
}
/*
@@ -829,7 +887,7 @@ out_end_io:
* (or in the case of a failure, cleans up without actually decompressing).
*/
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr)
+ block_t blkaddr, bool in_task)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
@@ -839,12 +897,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
if (failed)
WRITE_ONCE(dic->failed, true);
- else if (blkaddr)
+ else if (blkaddr && in_task)
f2fs_cache_compressed_page(sbi, page,
dic->inode->i_ino, blkaddr);
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, in_task);
}
static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -1552,16 +1610,14 @@ destroy_out:
return err;
}
-static void f2fs_free_dic(struct decompress_io_ctx *dic);
-
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
{
struct decompress_io_ctx *dic;
pgoff_t start_idx = start_idx_of_cluster(cc);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
int i;
- dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO,
- false, F2FS_I_SB(cc->inode));
+ dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi);
if (!dic)
return ERR_PTR(-ENOMEM);
@@ -1602,52 +1658,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->cpages[i] = page;
}
+ if (f2fs_prepare_decomp_mem(dic, false))
+ goto out_free;
+
return dic;
out_free:
- f2fs_free_dic(dic);
+ f2fs_free_dic(dic, true);
return ERR_PTR(-ENOMEM);
}
-static void f2fs_free_dic(struct decompress_io_ctx *dic)
+static void f2fs_late_free_dic(struct work_struct *work)
{
- int i;
-
- if (dic->tpages) {
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->rpages[i])
- continue;
- if (!dic->tpages[i])
- continue;
- f2fs_compress_free_page(dic->tpages[i]);
- }
- page_array_free(dic->inode, dic->tpages, dic->cluster_size);
- }
-
- if (dic->cpages) {
- for (i = 0; i < dic->nr_cpages; i++) {
- if (!dic->cpages[i])
- continue;
- f2fs_compress_free_page(dic->cpages[i]);
- }
- page_array_free(dic->inode, dic->cpages, dic->nr_cpages);
- }
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, free_work);
- page_array_free(dic->inode, dic->rpages, dic->nr_rpages);
- kmem_cache_free(dic_entry_slab, dic);
+ f2fs_free_dic(dic, false);
}
-static void f2fs_put_dic(struct decompress_io_ctx *dic)
+static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
{
- if (refcount_dec_and_test(&dic->refcnt))
- f2fs_free_dic(dic);
+ if (refcount_dec_and_test(&dic->refcnt)) {
+ if (in_task) {
+ f2fs_free_dic(dic, false);
+ } else {
+ INIT_WORK(&dic->free_work, f2fs_late_free_dic);
+ queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
+ &dic->free_work);
+ }
+ }
}
/*
* Update and unlock the cluster's pagecache pages, and release the reference to
* the decompress_io_ctx that was being held for I/O completion.
*/
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task)
{
int i;
@@ -1668,7 +1715,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
unlock_page(rpage);
}
- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_task);
}
static void f2fs_verify_cluster(struct work_struct *work)
@@ -1685,14 +1732,15 @@ static void f2fs_verify_cluster(struct work_struct *work)
SetPageError(rpage);
}
- __f2fs_decompress_end_io(dic, false);
+ __f2fs_decompress_end_io(dic, false, true);
}
/*
* This is called when a compressed cluster has been decompressed
* (or failed to be read and/or decompressed).
*/
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task)
{
if (!failed && dic->need_verity) {
/*
@@ -1704,7 +1752,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
} else {
- __f2fs_decompress_end_io(dic, failed);
+ __f2fs_decompress_end_io(dic, failed, in_task);
}
}
@@ -1713,12 +1761,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
*
* This is called when the page is no longer needed and can be freed.
*/
-void f2fs_put_page_dic(struct page *page)
+void f2fs_put_page_dic(struct page *page, bool in_task)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_task);
}
/*
@@ -1832,45 +1880,40 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
{
struct address_space *mapping = sbi->compress_inode->i_mapping;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = 0;
pgoff_t end = MAX_BLKADDR(sbi);
if (!mapping->nrpages)
return;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
do {
- unsigned int nr_pages;
- int i;
+ unsigned int nr, i;
- nr_pages = pagevec_lookup_range(&pvec, mapping,
- &index, end - 1);
- if (!nr_pages)
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
+ if (!nr)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
- if (page->index > end)
- break;
-
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
continue;
}
- if (ino != get_page_private_data(page)) {
- unlock_page(page);
+ if (ino != get_page_private_data(&folio->page)) {
+ folio_unlock(folio);
continue;
}
- generic_error_remove_page(mapping, page);
- unlock_page(page);
+ generic_error_remove_page(mapping, &folio->page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
} while (index < end);
}
@@ -1908,6 +1951,9 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
char slab_name[32];
+ if (!f2fs_sb_has_compression(sbi))
+ return 0;
+
sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev));
sbi->page_array_slab_size = sizeof(struct page *) <<
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7fcbcf979737..ad91f5e28a06 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -119,7 +119,7 @@ struct bio_post_read_ctx {
block_t fs_blkaddr;
};
-static void f2fs_finish_read_bio(struct bio *bio)
+static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -133,8 +133,9 @@ static void f2fs_finish_read_bio(struct bio *bio)
if (f2fs_is_compressed_page(page)) {
if (bio->bi_status)
- f2fs_end_read_compressed_page(page, true, 0);
- f2fs_put_page_dic(page);
+ f2fs_end_read_compressed_page(page, true, 0,
+ in_task);
+ f2fs_put_page_dic(page, in_task);
continue;
}
@@ -191,7 +192,7 @@ static void f2fs_verify_bio(struct work_struct *work)
fsverity_verify_bio(bio);
}
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, true);
}
/*
@@ -203,7 +204,7 @@ static void f2fs_verify_bio(struct work_struct *work)
* can involve reading verity metadata pages from the file, and these verity
* metadata pages may be encrypted and/or compressed.
*/
-static void f2fs_verify_and_finish_bio(struct bio *bio)
+static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task)
{
struct bio_post_read_ctx *ctx = bio->bi_private;
@@ -211,7 +212,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
INIT_WORK(&ctx->work, f2fs_verify_bio);
fsverity_enqueue_verify_work(&ctx->work);
} else {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, in_task);
}
}
@@ -224,7 +225,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
* that the bio includes at least one compressed page. The actual decompression
* is done on a per-cluster basis, not a per-bio basis.
*/
-static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
+static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
+ bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -237,7 +239,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
/* PG_error was set if decryption failed. */
if (f2fs_is_compressed_page(page))
f2fs_end_read_compressed_page(page, PageError(page),
- blkaddr);
+ blkaddr, in_task);
else
all_compressed = false;
@@ -262,15 +264,16 @@ static void f2fs_post_read_work(struct work_struct *work)
fscrypt_decrypt_bio(ctx->bio);
if (ctx->enabled_steps & STEP_DECOMPRESS)
- f2fs_handle_step_decompress(ctx);
+ f2fs_handle_step_decompress(ctx, true);
- f2fs_verify_and_finish_bio(ctx->bio);
+ f2fs_verify_and_finish_bio(ctx->bio, true);
}
static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
struct bio_post_read_ctx *ctx;
+ bool intask = in_task();
iostat_update_and_unbind_ctx(bio, 0);
ctx = bio->bi_private;
@@ -281,16 +284,29 @@ static void f2fs_read_end_io(struct bio *bio)
}
if (bio->bi_status) {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, intask);
return;
}
- if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
- INIT_WORK(&ctx->work, f2fs_post_read_work);
- queue_work(ctx->sbi->post_read_wq, &ctx->work);
- } else {
- f2fs_verify_and_finish_bio(bio);
+ if (ctx) {
+ unsigned int enabled_steps = ctx->enabled_steps &
+ (STEP_DECRYPT | STEP_DECOMPRESS);
+
+ /*
+ * If we have only decompression step between decompression and
+ * decrypt, we don't need post processing for this.
+ */
+ if (enabled_steps == STEP_DECOMPRESS &&
+ !f2fs_low_mem_mode(sbi)) {
+ f2fs_handle_step_decompress(ctx, intask);
+ } else if (enabled_steps) {
+ INIT_WORK(&ctx->work, f2fs_post_read_work);
+ queue_work(ctx->sbi->post_read_wq, &ctx->work);
+ return;
+ }
}
+
+ f2fs_verify_and_finish_bio(bio, intask);
}
static void f2fs_write_end_io(struct bio *bio)
@@ -2222,7 +2238,7 @@ skip_reading_dnode:
if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, true);
continue;
}
@@ -2240,7 +2256,7 @@ submit_and_realloc:
page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, true);
f2fs_put_dnode(&dn);
*bio_ret = NULL;
return ret;
@@ -3751,42 +3767,6 @@ out:
return blknr;
}
-#ifdef CONFIG_MIGRATION
-#include <linux/migrate.h>
-
-int f2fs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
-{
- int rc, extra_count = 0;
-
- BUG_ON(PageWriteback(page));
-
- rc = migrate_page_move_mapping(mapping, newpage,
- page, extra_count);
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- /* guarantee to start from no stale private field */
- set_page_private(newpage, 0);
- if (PagePrivate(page)) {
- set_page_private(newpage, page_private(page));
- SetPagePrivate(newpage);
- get_page(newpage);
-
- set_page_private(page, 0);
- ClearPagePrivate(page);
- put_page(page);
- }
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
-
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
-
#ifdef CONFIG_SWAP
static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
unsigned int blkcnt)
@@ -4018,15 +3998,13 @@ const struct address_space_operations f2fs_dblock_aops = {
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
.dirty_folio = f2fs_dirty_data_folio,
+ .migrate_folio = filemap_migrate_folio,
.invalidate_folio = f2fs_invalidate_folio,
.release_folio = f2fs_release_folio,
.direct_IO = noop_direct_IO,
.bmap = f2fs_bmap,
.swap_activate = f2fs_swap_activate,
.swap_deactivate = f2fs_swap_deactivate,
-#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
-#endif
};
void f2fs_clear_page_cache_dirty_tag(struct page *page)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9bbecd008d2..6061225a7a4d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -159,6 +159,7 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
+ int memory_mode; /* memory mode */
int discard_unit; /*
* discard command's offset/size should
* be aligned to this unit: block,
@@ -1360,6 +1361,13 @@ enum {
DISCARD_UNIT_SECTION, /* basic discard unit is section */
};
+enum {
+ MEMORY_MODE_NORMAL, /* memory mode for normal devices */
+ MEMORY_MODE_LOW, /* memory mode for low memry devices */
+};
+
+
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1580,6 +1588,7 @@ struct decompress_io_ctx {
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
struct work_struct verity_work; /* work to verify the decompressed pages */
+ struct work_struct free_work; /* work for late free this structure itself */
};
#define NULL_CLUSTER ((unsigned int)(~0))
@@ -3764,10 +3773,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
void f2fs_write_failed(struct inode *inode, loff_t to);
void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
bool f2fs_release_folio(struct folio *folio, gfp_t wait);
-#ifdef CONFIG_MIGRATION
-int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode);
-#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
void f2fs_clear_page_cache_dirty_tag(struct page *page);
int f2fs_init_post_read_processing(void);
@@ -4158,9 +4163,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic);
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task);
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr);
+ block_t blkaddr, bool in_task);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
@@ -4179,8 +4184,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
-void f2fs_put_page_dic(struct page *page);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task);
+void f2fs_put_page_dic(struct page *page, bool in_task);
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
@@ -4226,13 +4232,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
-static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { }
+static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic,
+ bool in_task) { }
static inline void f2fs_end_read_compressed_page(struct page *page,
- bool failed, block_t blkaddr)
+ bool failed, block_t blkaddr, bool in_task)
{
WARN_ON_ONCE(1);
}
-static inline void f2fs_put_page_dic(struct page *page)
+static inline void f2fs_put_page_dic(struct page *page, bool in_task)
{
WARN_ON_ONCE(1);
}
@@ -4398,6 +4405,11 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
+static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
+{
+ return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW;
+}
+
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index bd14cef1b08f..313bdff2361c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (err)
return err;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
err = f2fs_dquot_initialize(inode);
if (err)
return err;
}
- if ((attr->ia_valid & ATTR_UID &&
- !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (attr->ia_valid & ATTR_GID &&
- !gid_eq(attr->ia_gid, inode->i_gid))) {
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
f2fs_lock_op(F2FS_I_SB(inode));
- err = dquot_transfer(inode, attr);
+ err = dquot_transfer(mnt_userns, inode, attr);
if (err) {
set_sbi_flag(F2FS_I_SB(inode),
SBI_QUOTA_NEED_REPAIR);
@@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* update uid/gid under lock_op(), so that dquot and inode can
* be updated atomically.
*/
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
f2fs_mark_inode_dirty_sync(inode, true);
f2fs_unlock_op(F2FS_I_SB(inode));
}
@@ -3903,10 +3897,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
for (i = 0; i < page_len; i++, redirty_idx++) {
page = find_lock_page(mapping, redirty_idx);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
+
+ /* It will never fail, when page has pinned above */
+ f2fs_bug_on(F2FS_I_SB(inode), !page);
+
set_page_dirty(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d5fb426e0747..f4aa3c88118b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -37,7 +37,6 @@ static int gc_thread_func(void *data)
unsigned int wait_ms;
struct f2fs_gc_control gc_control = {
.victim_segno = NULL_SEGNO,
- .should_migrate_blocks = false,
.err_gc_skipped = false };
wait_ms = gc_th->min_sleep_time;
@@ -113,7 +112,10 @@ static int gc_thread_func(void *data)
sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
f2fs_down_write(&sbi->gc_lock);
+ gc_control.should_migrate_blocks = true;
goto do_gc;
+ } else {
+ gc_control.should_migrate_blocks = false;
}
if (foreground) {
@@ -139,7 +141,9 @@ do_gc:
if (!foreground)
stat_inc_bggc_count(sbi->stat_info);
- sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+ sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC ||
+ sbi->gc_mode == GC_URGENT_HIGH ||
+ sbi->gc_mode == GC_URGENT_MID;
/* foreground GC was been triggered via f2fs_balance_fs() */
if (foreground)
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 3fe145e8e594..19b956c2d697 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -120,15 +120,13 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
return free_blks - ovp_blks;
}
-static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+static inline block_t limit_invalid_user_blocks(block_t user_block_count)
{
- return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
+ return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100;
}
-static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks)
{
- block_t reclaimable_user_blocks = sbi->user_block_count -
- written_block_count(sbi);
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
@@ -163,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
{
- block_t invalid_user_blocks = sbi->user_block_count -
- written_block_count(sbi);
+ block_t user_block_count = sbi->user_block_count;
+ block_t invalid_user_blocks = user_block_count -
+ written_block_count(sbi);
/*
* Background GC is triggered with the following conditions.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
- if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
- free_user_blocks(sbi) < limit_free_user_blocks(sbi))
- return true;
- return false;
+ return (invalid_user_blocks >
+ limit_invalid_user_blocks(user_block_count) &&
+ free_user_blocks(sbi) <
+ limit_free_user_blocks(invalid_user_blocks));
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index cf6f7fc83c08..e501e70f32e8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1945,7 +1945,6 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
bool submitted = false;
- bool may_dirty = true;
/* give a priority to WB_SYNC threads */
if (atomic_read(&sbi->wb_sync_req[NODE]) &&
@@ -1998,11 +1997,8 @@ continue_unlock:
}
/* flush dirty inode */
- if (IS_INODE(page) && may_dirty) {
- may_dirty = false;
- if (flush_dirty_inode(page))
- goto lock_node;
- }
+ if (IS_INODE(page) && flush_dirty_inode(page))
+ goto lock_node;
write_node:
f2fs_wait_on_page_writeback(page, NODE, true, true);
@@ -2165,9 +2161,7 @@ const struct address_space_operations f2fs_node_aops = {
.dirty_folio = f2fs_dirty_node_folio,
.invalidate_folio = f2fs_invalidate_folio,
.release_folio = f2fs_release_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
-#endif
+ .migrate_folio = filemap_migrate_folio,
};
static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 3cb7f8a43b4d..dcd0a1e35095 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page)
memset(&attr, 0, sizeof(attr));
- attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid);
- attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid);
+ attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid));
+ attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid));
- if (!uid_eq(attr.ia_uid, inode->i_uid))
+ if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode)))
attr.ia_valid |= ATTR_UID;
- if (!gid_eq(attr.ia_gid, inode->i_gid))
+ if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode)))
attr.ia_valid |= ATTR_GID;
if (!attr.ia_valid)
return 0;
- err = dquot_transfer(inode, &attr);
+ err = dquot_transfer(&init_user_ns, inode, &attr);
if (err)
set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR);
return err;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 37221e94e5ef..cf9cf24f9b56 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>
@@ -159,6 +160,7 @@ enum {
Opt_gc_merge,
Opt_nogc_merge,
Opt_discard_unit,
+ Opt_memory_mode,
Opt_err,
};
@@ -235,6 +237,7 @@ static match_table_t f2fs_tokens = {
{Opt_gc_merge, "gc_merge"},
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
+ {Opt_memory_mode, "memory=%s"},
{Opt_err, NULL},
};
@@ -492,9 +495,19 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
bool is_remount)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
-#ifdef CONFIG_FS_ENCRYPTION
+ struct fs_parameter param = {
+ .type = fs_value_is_string,
+ .string = arg->from ? arg->from : "",
+ };
+ struct fscrypt_dummy_policy *policy =
+ &F2FS_OPTION(sbi).dummy_enc_policy;
int err;
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ f2fs_warn(sbi, "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
+
if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_err(sbi, "Encrypt feature is off");
return -EINVAL;
@@ -506,12 +519,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) {
+ if (is_remount && !fscrypt_is_dummy_policy_set(policy)) {
f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
return -EINVAL;
}
- err = fscrypt_set_test_dummy_encryption(
- sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy);
+
+ err = fscrypt_parse_test_dummy_encryption(&param, policy);
if (err) {
if (err == -EEXIST)
f2fs_warn(sbi,
@@ -524,12 +537,14 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
opt, err);
return -EINVAL;
}
+ err = fscrypt_add_test_dummy_key(sb, policy);
+ if (err) {
+ f2fs_warn(sbi, "Error adding test dummy encryption key [%d]",
+ err);
+ return err;
+ }
f2fs_warn(sbi, "Test dummy encryption mode enabled");
return 0;
-#else
- f2fs_warn(sbi, "test_dummy_encryption option not supported");
- return -EINVAL;
-#endif
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -1222,6 +1237,22 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_memory_mode:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "normal")) {
+ F2FS_OPTION(sbi).memory_mode =
+ MEMORY_MODE_NORMAL;
+ } else if (!strcmp(name, "low")) {
+ F2FS_OPTION(sbi).memory_mode =
+ MEMORY_MODE_LOW;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1993,6 +2024,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
seq_printf(seq, ",discard_unit=%s", "section");
+ if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL)
+ seq_printf(seq, ",memory=%s", "normal");
+ else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW)
+ seq_printf(seq, ",memory=%s", "low");
+
return 0;
}
@@ -2014,6 +2050,7 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).compress_ext_cnt = 0;
F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
+ F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL;
sbi->sb->s_flags &= ~SB_INLINECRYPT;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3dae3ed60f3a..3e4eb3467cb4 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
* out the RO attribute for checking by the security
* module, just because it maps to a file mode.
*/
- err = security_inode_setattr(file->f_path.dentry, &ia);
+ err = security_inode_setattr(file_mnt_user_ns(file),
+ file->f_path.dentry, &ia);
if (err)
goto out_unlock_inode;
@@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
if (((attr->ia_valid & ATTR_UID) &&
- (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+ (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid),
+ sbi->options.fs_uid))) ||
((attr->ia_valid & ATTR_GID) &&
- (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+ (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid),
+ sbi->options.fs_gid))) ||
((attr->ia_valid & ATTR_MODE) &&
(attr->ia_mode & ~FAT_VALID_MODE)))
error = -EPERM;
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index c2ef9f0debbd..9b49ec36e667 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -13,16 +13,6 @@
#include "vxfs_extern.h"
#include "vxfs_inode.h"
-
-static int vxfs_immed_read_folio(struct file *, struct folio *);
-
-/*
- * Address space operations for immed files and directories.
- */
-const struct address_space_operations vxfs_immed_aops = {
- .read_folio = vxfs_immed_read_folio,
-};
-
/**
* vxfs_immed_read_folio - read part of an immed inode into pagecache
* @file: file context (unused)
@@ -30,7 +20,7 @@ const struct address_space_operations vxfs_immed_aops = {
*
* Description:
* vxfs_immed_read_folio reads a part of the immed area of the
- * file that hosts @pp into the pagecache.
+ * file that hosts @folio into the pagecache.
*
* Returns:
* Zero on success, else a negative error code.
@@ -38,21 +28,26 @@ const struct address_space_operations vxfs_immed_aops = {
* Locking status:
* @folio is locked and will be unlocked.
*/
-static int
-vxfs_immed_read_folio(struct file *fp, struct folio *folio)
+static int vxfs_immed_read_folio(struct file *fp, struct folio *folio)
{
- struct page *pp = &folio->page;
- struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host);
- u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT;
- caddr_t kaddr;
+ struct vxfs_inode_info *vip = VXFS_INO(folio->mapping->host);
+ void *src = vip->vii_immed.vi_immed + folio_pos(folio);
+ unsigned long i;
- kaddr = kmap(pp);
- memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE);
- kunmap(pp);
-
- flush_dcache_page(pp);
- SetPageUptodate(pp);
- unlock_page(pp);
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ memcpy_to_page(folio_page(folio, i), 0, src, PAGE_SIZE);
+ src += PAGE_SIZE;
+ }
+
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
+
+/*
+ * Address space operations for immed files and directories.
+ */
+const struct address_space_operations vxfs_immed_aops = {
+ .read_folio = vxfs_immed_read_folio,
+};
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 0e633d2bfc7d..c99282df7761 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -51,15 +51,9 @@ vxfs_get_page(struct address_space *mapping, u_long n)
kmap(pp);
/** if (!PageChecked(pp)) **/
/** vxfs_check_page(pp); **/
- if (PageError(pp))
- goto fail;
}
return (pp);
-
-fail:
- vxfs_put_page(pp);
- return ERR_PTR(-EIO);
}
/**
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 10eb50cbf398..e23e802a8013 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
WARN_ON(fcd->nr_free_ranges <= 0);
fcd->nr_free_ranges--;
}
+ __kick_dmap_free_worker(fcd, 0);
spin_unlock(&fcd->lock);
- kick_dmap_free_worker(fcd, 0);
return dmap;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 74303d6e987b..a93d675a726a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -537,6 +537,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_file *ff;
void *security_ctx = NULL;
u32 security_ctxlen;
+ bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@@ -561,7 +562,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.mode = mode;
inarg.umask = current_umask();
- if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
+ if (fm->fc->handle_killpriv_v2 && trunc &&
!(flags & O_EXCL) && !capable(CAP_FSETID)) {
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
@@ -623,6 +624,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
+ if (fm->fc->atomic_o_trunc && trunc)
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
}
return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 05caa2b9272e..dfee142bca5c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
- truncate_pagecache(inode, 0);
file_update_time(file);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
- } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
- invalidate_inode_pages2(inode->i_mapping);
}
-
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (is_wb_truncate || dax_truncate) {
+ if (is_wb_truncate || dax_truncate)
inode_lock(inode);
- fuse_set_nowrite(inode);
- }
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
- goto out;
+ goto out_inode_unlock;
}
+ if (is_wb_truncate || dax_truncate)
+ fuse_set_nowrite(inode);
+
err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
-out:
+ if (is_wb_truncate || dax_truncate)
+ fuse_release_nowrite(inode);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ }
if (dax_truncate)
filemap_invalidate_unlock(inode->i_mapping);
-
- if (is_wb_truncate | dax_truncate) {
- fuse_release_nowrite(inode);
+out_inode_unlock:
+ if (is_wb_truncate || dax_truncate)
inode_unlock(inode);
- }
return err;
}
@@ -338,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * Dirty pages might remain despite write_inode_now() call from
+ * fuse_flush() due to writes racing with the close.
+ */
+ if (fc->writeback_cache)
+ write_inode_now(inode, 1);
+
fuse_release_common(file, false);
/* return value is ignored by VFS */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 106e90a36583..57ff883d432c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -774,7 +774,7 @@ static const struct address_space_operations gfs2_aops = {
.invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
.direct_IO = noop_direct_IO,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2cceb193dcd8..d8f1239344c1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1066,8 +1066,7 @@ out_unlock:
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- if (statfs_gh)
- kfree(statfs_gh);
+ kfree(statfs_gh);
from->count = orig_count - written;
return written ? written : ret;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c992d53013d3..b3259c8e6dad 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -540,9 +540,10 @@ restart:
continue;
if (!may_grant(gl, first_gh, gh)) {
/*
- * If we get here, it means we may not grant this holder for
- * some reason. If this holder is the head of the list, it
- * means we have a blocked holder at the head, so return 1.
+ * If we get here, it means we may not grant this
+ * holder for some reason. If this holder is at the
+ * head of the list, it means we have a blocked holder
+ * at the head, so return 1.
*/
if (list_is_first(&gh->gh_list, &gl->gl_holders))
return 1;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2559a79cf14b..6ce369b096d4 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1058,7 +1058,7 @@ restart:
/*
* Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
- * to accomodate the largest slot number. (NB dlm slot numbers start at 1,
+ * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
* gfs2 jids start at 0, so jid = slot - 1)
*/
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6ba51cbb94cf..1f67d37cd225 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -452,36 +452,36 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
* @head: The journal head to start from
* @done: If set, perform only cleanup, else search and set if found.
*
- * Find the page with 'index' in the journal's mapping. Search the page for
+ * Find the folio with 'index' in the journal's mapping. Search the folio for
* the journal head if requested (cleanup == false). Release refs on the
- * page so the page cache can reclaim it (put_page() twice). We grabbed a
- * reference on this page two times, first when we did a find_or_create_page()
- * to obtain the page to add it to the bio and second when we do a
- * find_get_page() here to get the page to wait on while I/O on it is being
+ * folio so the page cache can reclaim it. We grabbed a
+ * reference on this folio twice, first when we did a find_or_create_page()
+ * to obtain the folio to add it to the bio and second when we do a
+ * filemap_get_folio() here to get the folio to wait on while I/O on it is being
* completed.
- * This function is also used to free up a page we might've grabbed but not
+ * This function is also used to free up a folio we might've grabbed but not
* used. Maybe we added it to a bio, but not submitted it for I/O. Or we
* submitted the I/O, but we already found the jhead so we only need to drop
- * our references to the page.
+ * our references to the folio.
*/
static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
struct gfs2_log_header_host *head,
bool *done)
{
- struct page *page;
+ struct folio *folio;
- page = find_get_page(jd->jd_inode->i_mapping, index);
- wait_on_page_locked(page);
+ folio = filemap_get_folio(jd->jd_inode->i_mapping, index);
- if (PageError(page))
+ folio_wait_locked(folio);
+ if (folio_test_error(folio))
*done = true;
if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, page);
+ *done = gfs2_jhead_pg_srch(jd, head, &folio->page);
- put_page(page); /* Once for find_get_page */
- put_page(page); /* Once more for find_or_create_page */
+ /* filemap_get_folio() and the earlier find_or_create_page() */
+ folio_put_refs(folio, 2);
}
static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index c0a73a6ffb28..c83fd0e8404d 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -296,10 +296,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page = read_mapping_page(mapping, block++, NULL);
if (IS_ERR(page))
goto fail;
- if (PageError(page)) {
- put_page(page);
- goto fail;
- }
node->page[i] = page;
}
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 177fae4e6581..a5ab00e54220 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -447,10 +447,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page = read_mapping_page(mapping, block, NULL);
if (IS_ERR(page))
goto fail;
- if (PageError(page)) {
- put_page(page);
- goto fail;
- }
node->page[i] = page;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index cc1bc6f93a01..07881b76d42f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -416,15 +416,15 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
if (err != count) {
- ClearPageUptodate(page);
+ if (err >= 0)
+ err = -EIO;
+ mapping_set_error(mapping, err);
goto out;
}
if (base > inode->i_size)
inode->i_size = base;
- if (PageError(page))
- ClearPageError(page);
err = 0;
out:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 02eb72351b15..20336cb3c040 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,16 +108,6 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
}
#endif
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -480,25 +470,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
next = start;
- while (next < end) {
- /*
- * When no more pages are found, we are done.
- */
- if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
+ while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ struct folio *folio = fbatch.folios[i];
u32 hash = 0;
- index = page->index;
+ index = folio->index;
if (!truncate_op) {
/*
* Only need to hold the fault mutex in the
@@ -511,15 +495,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
/*
- * If page is mapped, it was faulted in after being
+ * If folio is mapped, it was faulted in after being
* unmapped in caller. Unmap (again) now after taking
* the fault mutex. The mutex will prevent faults
- * until we finish removing the page.
+ * until we finish removing the folio.
*
* This race can only happen in the hole punch case.
* Getting here in a truncate operation is a bug.
*/
- if (unlikely(page_mapped(page))) {
+ if (unlikely(folio_mapped(folio))) {
BUG_ON(truncate_op);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -532,7 +516,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
i_mmap_unlock_write(mapping);
}
- lock_page(page);
+ folio_lock(folio);
/*
* We must free the huge page and remove from page
* cache (remove_huge_page) BEFORE removing the
@@ -542,8 +526,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* the subpool and global reserve usage count can need
* to be adjusted.
*/
- VM_BUG_ON(HPageRestoreReserve(page));
- remove_huge_page(page);
+ VM_BUG_ON(HPageRestoreReserve(&folio->page));
+ remove_huge_page(&folio->page);
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
@@ -551,11 +535,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
hugetlb_fix_reserve_counts(inode);
}
- unlock_page(page);
+ folio_unlock(folio);
if (!truncate_op)
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
- huge_pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
@@ -797,7 +781,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
SetHPageMigratable(page);
/*
- * unlock_page because locked by add_to_page_cache()
+ * unlock_page because locked by huge_add_to_page_cache()
* put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
@@ -1008,28 +992,33 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
return error;
}
-static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page,
+#ifdef CONFIG_MIGRATION
+static int hugetlbfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
int rc;
- rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ rc = migrate_huge_page_move_mapping(mapping, dst, src);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- if (hugetlb_page_subpool(page)) {
- hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
- hugetlb_set_page_subpool(page, NULL);
+ if (hugetlb_page_subpool(&src->page)) {
+ hugetlb_set_page_subpool(&dst->page,
+ hugetlb_page_subpool(&src->page));
+ hugetlb_set_page_subpool(&src->page, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(dst, src);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
}
+#else
+#define hugetlbfs_migrate_folio NULL
+#endif
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
@@ -1196,7 +1185,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
.dirty_folio = noop_dirty_folio,
- .migratepage = hugetlbfs_migrate_page,
+ .migrate_folio = hugetlbfs_migrate_folio,
.error_remove_page = hugetlbfs_error_remove_page,
};
diff --git a/fs/inode.c b/fs/inode.c
index bd4da9c5207e..d451f30d5851 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -422,6 +422,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
+ INIT_LIST_HEAD(&inode->i_sb_list);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
@@ -1021,7 +1022,6 @@ struct inode *new_inode_pseudo(struct super_block *sb)
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
- INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
@@ -1165,7 +1165,6 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
- bool creating = inode->i_state & I_CREATING;
again:
spin_lock(&inode_hash_lock);
@@ -1199,7 +1198,12 @@ again:
inode->i_state |= I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- if (!creating)
+
+ /*
+ * Add inode to the sb list if it's not already. It has I_NEW at this
+ * point, so it should be safe to test i_sb_list locklessly.
+ */
+ if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
unlock:
spin_unlock(&inode_hash_lock);
@@ -2010,67 +2014,57 @@ static int __remove_privs(struct user_namespace *mnt_userns,
return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
-/*
- * Remove special file priviledges (suid, capabilities) when file is written
- * to or truncated.
- */
-int file_remove_privs(struct file *file)
+static int __file_remove_privs(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
+ int error;
int kill;
- int error = 0;
- /*
- * Fast path for nothing security related.
- * As well for non-regular files, e.g. blkdev inodes.
- * For example, blkdev_write_iter() might get here
- * trying to remove privs which it is not allowed to.
- */
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
- if (kill < 0)
+ if (kill <= 0)
return kill;
- if (kill)
- error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
+
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error)
inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL(file_remove_privs);
/**
- * file_update_time - update mtime and ctime time
- * @file: file accessed
+ * file_remove_privs - remove special file privileges (suid, capabilities)
+ * @file: file to remove privileges from
*
- * Update the mtime and ctime members of an inode and mark the inode
- * for writeback. Note that this function is meant exclusively for
- * usage in the file write path of filesystems, and filesystems may
- * choose to explicitly ignore update via this function with the
- * S_NOCMTIME inode flag, e.g. for network filesystem where these
- * timestamps are handled by the server. This can return an error for
- * file systems who need to allocate space in order to update an inode.
+ * When file is modified by a write or truncation ensure that special
+ * file privileges are removed.
+ *
+ * Return: 0 on success, negative errno on failure.
*/
+int file_remove_privs(struct file *file)
+{
+ return __file_remove_privs(file, 0);
+}
+EXPORT_SYMBOL(file_remove_privs);
-int file_update_time(struct file *file)
+static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{
- struct inode *inode = file_inode(file);
- struct timespec64 now;
int sync_it = 0;
- int ret;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
+ if (!timespec64_equal(&inode->i_mtime, now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, &now))
+ if (!timespec64_equal(&inode->i_ctime, now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2079,37 +2073,127 @@ int file_update_time(struct file *file)
if (!sync_it)
return 0;
- /* Finally allowed to write? Takes lock. */
- if (__mnt_want_write_file(file))
- return 0;
+ return sync_it;
+}
- ret = inode_update_time(inode, &now, sync_it);
- __mnt_drop_write_file(file);
+static int __file_update_time(struct file *file, struct timespec64 *now,
+ int sync_mode)
+{
+ int ret = 0;
+ struct inode *inode = file_inode(file);
+
+ /* try to update time settings */
+ if (!__mnt_want_write_file(file)) {
+ ret = inode_update_time(inode, now, sync_mode);
+ __mnt_drop_write_file(file);
+ }
return ret;
}
+
+/**
+ * file_update_time - update mtime and ctime time
+ * @file: file accessed
+ *
+ * Update the mtime and ctime members of an inode and mark the inode for
+ * writeback. Note that this function is meant exclusively for usage in
+ * the file write path of filesystems, and filesystems may choose to
+ * explicitly ignore updates via this function with the _NOCMTIME inode
+ * flag, e.g. for network filesystem where these imestamps are handled
+ * by the server. This can return an error for file systems who need to
+ * allocate space in order to update an inode.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_update_time(struct file *file)
+{
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
+
+ ret = inode_needs_update_time(inode, &now);
+ if (ret <= 0)
+ return ret;
+
+ return __file_update_time(file, &now, ret);
+}
EXPORT_SYMBOL(file_update_time);
-/* Caller must hold the file's inode lock */
-int file_modified(struct file *file)
+/**
+ * file_modified_flags - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ * @flags: kiocb flags
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * If IOCB_NOWAIT is set, special file privileges will not be removed and
+ * time settings will not be updated. It will return -EAGAIN.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int file_modified_flags(struct file *file, int flags)
{
- int err;
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
- err = file_remove_privs(file);
- if (err)
- return err;
+ ret = __file_remove_privs(file, flags);
+ if (ret)
+ return ret;
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
- return file_update_time(file);
+ ret = inode_needs_update_time(inode, &now);
+ if (ret <= 0)
+ return ret;
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ return __file_update_time(file, &now, ret);
+}
+
+/**
+ * file_modified - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_modified(struct file *file)
+{
+ return file_modified_flags(file, 0);
}
EXPORT_SYMBOL(file_modified);
+/**
+ * kiocb_modified - handle mandated vfs changes when modifying a file
+ * @iocb: iocb that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kiocb_modified(struct kiocb *iocb)
+{
+ return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
+}
+EXPORT_SYMBOL_GPL(kiocb_modified);
+
int inode_needs_sync(struct inode *inode)
{
if (IS_SYNC(inode))
diff --git a/fs/io-wq.c b/fs/io-wq.c
deleted file mode 100644
index 824623bcf1a5..000000000000
--- a/fs/io-wq.c
+++ /dev/null
@@ -1,1424 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Basic worker thread pool for io_uring
- *
- * Copyright (C) 2019 Jens Axboe
- *
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/sched/signal.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/rculist_nulls.h>
-#include <linux/cpu.h>
-#include <linux/task_work.h>
-#include <linux/audit.h>
-#include <uapi/linux/io_uring.h>
-
-#include "io-wq.h"
-
-#define WORKER_IDLE_TIMEOUT (5 * HZ)
-
-enum {
- IO_WORKER_F_UP = 1, /* up and active */
- IO_WORKER_F_RUNNING = 2, /* account as running */
- IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_BOUND = 8, /* is doing bounded work */
-};
-
-enum {
- IO_WQ_BIT_EXIT = 0, /* wq exiting */
-};
-
-enum {
- IO_ACCT_STALLED_BIT = 0, /* stalled on hash */
-};
-
-/*
- * One for each thread in a wqe pool
- */
-struct io_worker {
- refcount_t ref;
- unsigned flags;
- struct hlist_nulls_node nulls_node;
- struct list_head all_list;
- struct task_struct *task;
- struct io_wqe *wqe;
-
- struct io_wq_work *cur_work;
- struct io_wq_work *next_work;
- raw_spinlock_t lock;
-
- struct completion ref_done;
-
- unsigned long create_state;
- struct callback_head create_work;
- int create_index;
-
- union {
- struct rcu_head rcu;
- struct work_struct work;
- };
-};
-
-#if BITS_PER_LONG == 64
-#define IO_WQ_HASH_ORDER 6
-#else
-#define IO_WQ_HASH_ORDER 5
-#endif
-
-#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
-
-struct io_wqe_acct {
- unsigned nr_workers;
- unsigned max_workers;
- int index;
- atomic_t nr_running;
- raw_spinlock_t lock;
- struct io_wq_work_list work_list;
- unsigned long flags;
-};
-
-enum {
- IO_WQ_ACCT_BOUND,
- IO_WQ_ACCT_UNBOUND,
- IO_WQ_ACCT_NR,
-};
-
-/*
- * Per-node worker thread pool
- */
-struct io_wqe {
- raw_spinlock_t lock;
- struct io_wqe_acct acct[IO_WQ_ACCT_NR];
-
- int node;
-
- struct hlist_nulls_head free_list;
- struct list_head all_list;
-
- struct wait_queue_entry wait;
-
- struct io_wq *wq;
- struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
-
- cpumask_var_t cpu_mask;
-};
-
-/*
- * Per io_wq state
- */
-struct io_wq {
- unsigned long state;
-
- free_work_fn *free_work;
- io_wq_work_fn *do_work;
-
- struct io_wq_hash *hash;
-
- atomic_t worker_refs;
- struct completion worker_done;
-
- struct hlist_node cpuhp_node;
-
- struct task_struct *task;
-
- struct io_wqe *wqes[];
-};
-
-static enum cpuhp_state io_wq_online;
-
-struct io_cb_cancel_data {
- work_cancel_fn *fn;
- void *data;
- int nr_running;
- int nr_pending;
- bool cancel_all;
-};
-
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
-static void io_wqe_dec_running(struct io_worker *worker);
-static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
- struct io_wqe_acct *acct,
- struct io_cb_cancel_data *match);
-static void create_worker_cb(struct callback_head *cb);
-static void io_wq_cancel_tw_create(struct io_wq *wq);
-
-static bool io_worker_get(struct io_worker *worker)
-{
- return refcount_inc_not_zero(&worker->ref);
-}
-
-static void io_worker_release(struct io_worker *worker)
-{
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
-}
-
-static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound)
-{
- return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND];
-}
-
-static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
- struct io_wq_work *work)
-{
- return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND));
-}
-
-static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
-{
- return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND);
-}
-
-static void io_worker_ref_put(struct io_wq *wq)
-{
- if (atomic_dec_and_test(&wq->worker_refs))
- complete(&wq->worker_done);
-}
-
-static void io_worker_cancel_cb(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
-
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
-}
-
-static bool io_task_worker_match(struct callback_head *cb, void *data)
-{
- struct io_worker *worker;
-
- if (cb->func != create_worker_cb)
- return false;
- worker = container_of(cb, struct io_worker, create_work);
- return worker == data;
-}
-
-static void io_worker_exit(struct io_worker *worker)
-{
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
-
- while (1) {
- struct callback_head *cb = task_work_cancel_match(wq->task,
- io_task_worker_match, worker);
-
- if (!cb)
- break;
- io_worker_cancel_cb(worker);
- }
-
- io_worker_release(worker);
- wait_for_completion(&worker->ref_done);
-
- raw_spin_lock(&wqe->lock);
- if (worker->flags & IO_WORKER_F_FREE)
- hlist_nulls_del_rcu(&worker->nulls_node);
- list_del_rcu(&worker->all_list);
- raw_spin_unlock(&wqe->lock);
- io_wqe_dec_running(worker);
- worker->flags = 0;
- preempt_disable();
- current->flags &= ~PF_IO_WORKER;
- preempt_enable();
-
- kfree_rcu(worker, rcu);
- io_worker_ref_put(wqe->wq);
- do_exit(0);
-}
-
-static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
-{
- bool ret = false;
-
- raw_spin_lock(&acct->lock);
- if (!wq_list_empty(&acct->work_list) &&
- !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- ret = true;
- raw_spin_unlock(&acct->lock);
-
- return ret;
-}
-
-/*
- * Check head of free list for an available worker. If one isn't available,
- * caller must create one.
- */
-static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
- struct io_wqe_acct *acct)
- __must_hold(RCU)
-{
- struct hlist_nulls_node *n;
- struct io_worker *worker;
-
- /*
- * Iterate free_list and see if we can find an idle worker to
- * activate. If a given worker is on the free_list but in the process
- * of exiting, keep trying.
- */
- hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
- if (!io_worker_get(worker))
- continue;
- if (io_wqe_get_acct(worker) != acct) {
- io_worker_release(worker);
- continue;
- }
- if (wake_up_process(worker->task)) {
- io_worker_release(worker);
- return true;
- }
- io_worker_release(worker);
- }
-
- return false;
-}
-
-/*
- * We need a worker. If we find a free one, we're good. If not, and we're
- * below the max number of workers, create one.
- */
-static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
-{
- /*
- * Most likely an attempt to queue unbounded work on an io_wq that
- * wasn't setup with any unbounded workers.
- */
- if (unlikely(!acct->max_workers))
- pr_warn_once("io-wq is not configured for unbound workers");
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers >= acct->max_workers) {
- raw_spin_unlock(&wqe->lock);
- return true;
- }
- acct->nr_workers++;
- raw_spin_unlock(&wqe->lock);
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- return create_io_worker(wqe->wq, wqe, acct->index);
-}
-
-static void io_wqe_inc_running(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-
- atomic_inc(&acct->nr_running);
-}
-
-static void create_worker_cb(struct callback_head *cb)
-{
- struct io_worker *worker;
- struct io_wq *wq;
- struct io_wqe *wqe;
- struct io_wqe_acct *acct;
- bool do_create = false;
-
- worker = container_of(cb, struct io_worker, create_work);
- wqe = worker->wqe;
- wq = wqe->wq;
- acct = &wqe->acct[worker->create_index];
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- acct->nr_workers++;
- do_create = true;
- }
- raw_spin_unlock(&wqe->lock);
- if (do_create) {
- create_io_worker(wq, wqe, worker->create_index);
- } else {
- atomic_dec(&acct->nr_running);
- io_worker_ref_put(wq);
- }
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
-}
-
-static bool io_queue_worker_create(struct io_worker *worker,
- struct io_wqe_acct *acct,
- task_work_func_t func)
-{
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
-
- /* raced with exit, just ignore create call */
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- goto fail;
- if (!io_worker_get(worker))
- goto fail;
- /*
- * create_state manages ownership of create_work/index. We should
- * only need one entry per worker, as the worker going to sleep
- * will trigger the condition, and waking will clear it once it
- * runs the task_work.
- */
- if (test_bit(0, &worker->create_state) ||
- test_and_set_bit_lock(0, &worker->create_state))
- goto fail_release;
-
- atomic_inc(&wq->worker_refs);
- init_task_work(&worker->create_work, func);
- worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
- /*
- * EXIT may have been set after checking it above, check after
- * adding the task_work and remove any creation item if it is
- * now set. wq exit does that too, but we can have added this
- * work item after we canceled in io_wq_exit_workers().
- */
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- io_wq_cancel_tw_create(wq);
- io_worker_ref_put(wq);
- return true;
- }
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
-fail_release:
- io_worker_release(worker);
-fail:
- atomic_dec(&acct->nr_running);
- io_worker_ref_put(wq);
- return false;
-}
-
-static void io_wqe_dec_running(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
-
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
-
- if (!atomic_dec_and_test(&acct->nr_running))
- return;
- if (!io_acct_run_queue(acct))
- return;
-
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(worker, acct, create_worker_cb);
-}
-
-/*
- * Worker will start processing some work. Move it to the busy list, if
- * it's currently on the freelist
- */
-static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
-{
- if (worker->flags & IO_WORKER_F_FREE) {
- worker->flags &= ~IO_WORKER_F_FREE;
- raw_spin_lock(&wqe->lock);
- hlist_nulls_del_init_rcu(&worker->nulls_node);
- raw_spin_unlock(&wqe->lock);
- }
-}
-
-/*
- * No work, worker going to sleep. Move to freelist, and unuse mm if we
- * have one attached. Dropping the mm may potentially sleep, so we drop
- * the lock in that case and return success. Since the caller has to
- * retry the loop in that case (we changed task state), we don't regrab
- * the lock if we return success.
- */
-static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
- __must_hold(wqe->lock)
-{
- if (!(worker->flags & IO_WORKER_F_FREE)) {
- worker->flags |= IO_WORKER_F_FREE;
- hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
- }
-}
-
-static inline unsigned int io_get_work_hash(struct io_wq_work *work)
-{
- return work->flags >> IO_WQ_HASH_SHIFT;
-}
-
-static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
-{
- struct io_wq *wq = wqe->wq;
- bool ret = false;
-
- spin_lock_irq(&wq->hash->wait.lock);
- if (list_empty(&wqe->wait.entry)) {
- __add_wait_queue(&wq->hash->wait, &wqe->wait);
- if (!test_bit(hash, &wq->hash->map)) {
- __set_current_state(TASK_RUNNING);
- list_del_init(&wqe->wait.entry);
- ret = true;
- }
- }
- spin_unlock_irq(&wq->hash->wait.lock);
- return ret;
-}
-
-static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
- struct io_worker *worker)
- __must_hold(acct->lock)
-{
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work, *tail;
- unsigned int stall_hash = -1U;
- struct io_wqe *wqe = worker->wqe;
-
- wq_list_for_each(node, prev, &acct->work_list) {
- unsigned int hash;
-
- work = container_of(node, struct io_wq_work, list);
-
- /* not hashed, can run anytime */
- if (!io_wq_is_hashed(work)) {
- wq_list_del(&acct->work_list, node, prev);
- return work;
- }
-
- hash = io_get_work_hash(work);
- /* all items with this hash lie in [work, tail] */
- tail = wqe->hash_tail[hash];
-
- /* hashed, can run if not already running */
- if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
- wqe->hash_tail[hash] = NULL;
- wq_list_cut(&acct->work_list, &tail->list, prev);
- return work;
- }
- if (stall_hash == -1U)
- stall_hash = hash;
- /* fast forward to a next hash, for-each will fix up @prev */
- node = &tail->list;
- }
-
- if (stall_hash != -1U) {
- bool unstalled;
-
- /*
- * Set this before dropping the lock to avoid racing with new
- * work being added and clearing the stalled bit.
- */
- set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&acct->lock);
- unstalled = io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&acct->lock);
- if (unstalled) {
- clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- if (wq_has_sleeper(&wqe->wq->hash->wait))
- wake_up(&wqe->wq->hash->wait);
- }
- }
-
- return NULL;
-}
-
-static bool io_flush_signals(void)
-{
- if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
- __set_current_state(TASK_RUNNING);
- clear_notify_signal();
- if (task_work_pending(current))
- task_work_run();
- return true;
- }
- return false;
-}
-
-static void io_assign_current_work(struct io_worker *worker,
- struct io_wq_work *work)
-{
- if (work) {
- io_flush_signals();
- cond_resched();
- }
-
- raw_spin_lock(&worker->lock);
- worker->cur_work = work;
- worker->next_work = NULL;
- raw_spin_unlock(&worker->lock);
-}
-
-static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
-
-static void io_worker_handle_work(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
- bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
-
- do {
- struct io_wq_work *work;
-
- /*
- * If we got some work, mark us as busy. If we didn't, but
- * the list isn't empty, it means we stalled on hashed work.
- * Mark us stalled so we don't keep looking for work when we
- * can't make progress, any work completion or insertion will
- * clear the stalled flag.
- */
- raw_spin_lock(&acct->lock);
- work = io_get_next_work(acct, worker);
- raw_spin_unlock(&acct->lock);
- if (work) {
- __io_worker_busy(wqe, worker);
-
- /*
- * Make sure cancelation can find this, even before
- * it becomes the active work. That avoids a window
- * where the work has been removed from our general
- * work list, but isn't yet discoverable as the
- * current work item for this worker.
- */
- raw_spin_lock(&worker->lock);
- worker->next_work = work;
- raw_spin_unlock(&worker->lock);
- } else {
- break;
- }
- io_assign_current_work(worker, work);
- __set_current_state(TASK_RUNNING);
-
- /* handle a whole dependent link */
- do {
- struct io_wq_work *next_hashed, *linked;
- unsigned int hash = io_get_work_hash(work);
-
- next_hashed = wq_next_work(work);
-
- if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
- work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(work);
- io_assign_current_work(worker, NULL);
-
- linked = wq->free_work(work);
- work = next_hashed;
- if (!work && linked && !io_wq_is_hashed(linked)) {
- work = linked;
- linked = NULL;
- }
- io_assign_current_work(worker, work);
- if (linked)
- io_wqe_enqueue(wqe, linked);
-
- if (hash != -1U && !next_hashed) {
- /* serialize hash clear with wake_up() */
- spin_lock_irq(&wq->hash->wait.lock);
- clear_bit(hash, &wq->hash->map);
- clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- spin_unlock_irq(&wq->hash->wait.lock);
- if (wq_has_sleeper(&wq->hash->wait))
- wake_up(&wq->hash->wait);
- }
- } while (work);
- } while (1);
-}
-
-static int io_wqe_worker(void *data)
-{
- struct io_worker *worker = data;
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
- bool last_timeout = false;
- char buf[TASK_COMM_LEN];
-
- worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
-
- snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
- set_task_comm(current, buf);
-
- audit_alloc_kernel(current);
-
- while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- long ret;
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (io_acct_run_queue(acct))
- io_worker_handle_work(worker);
-
- raw_spin_lock(&wqe->lock);
- /* timed out, exit unless we're the last worker */
- if (last_timeout && acct->nr_workers > 1) {
- acct->nr_workers--;
- raw_spin_unlock(&wqe->lock);
- __set_current_state(TASK_RUNNING);
- break;
- }
- last_timeout = false;
- __io_worker_idle(wqe, worker);
- raw_spin_unlock(&wqe->lock);
- if (io_flush_signals())
- continue;
- ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
- if (signal_pending(current)) {
- struct ksignal ksig;
-
- if (!get_signal(&ksig))
- continue;
- break;
- }
- last_timeout = !ret;
- }
-
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- io_worker_handle_work(worker);
-
- audit_free(current);
- io_worker_exit(worker);
- return 0;
-}
-
-/*
- * Called when a worker is scheduled in. Mark us as currently running.
- */
-void io_wq_worker_running(struct task_struct *tsk)
-{
- struct io_worker *worker = tsk->worker_private;
-
- if (!worker)
- return;
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
- if (worker->flags & IO_WORKER_F_RUNNING)
- return;
- worker->flags |= IO_WORKER_F_RUNNING;
- io_wqe_inc_running(worker);
-}
-
-/*
- * Called when worker is going to sleep. If there are no workers currently
- * running and we have work pending, wake up a free one or create a new one.
- */
-void io_wq_worker_sleeping(struct task_struct *tsk)
-{
- struct io_worker *worker = tsk->worker_private;
-
- if (!worker)
- return;
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
- if (!(worker->flags & IO_WORKER_F_RUNNING))
- return;
-
- worker->flags &= ~IO_WORKER_F_RUNNING;
- io_wqe_dec_running(worker);
-}
-
-static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
- struct task_struct *tsk)
-{
- tsk->worker_private = worker;
- worker->task = tsk;
- set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
- tsk->flags |= PF_NO_SETAFFINITY;
-
- raw_spin_lock(&wqe->lock);
- hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
- list_add_tail_rcu(&worker->all_list, &wqe->all_list);
- worker->flags |= IO_WORKER_F_FREE;
- raw_spin_unlock(&wqe->lock);
- wake_up_new_task(tsk);
-}
-
-static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
-{
- return true;
-}
-
-static inline bool io_should_retry_thread(long err)
-{
- /*
- * Prevent perpetual task_work retry, if the task (or its group) is
- * exiting.
- */
- if (fatal_signal_pending(current))
- return false;
-
- switch (err) {
- case -EAGAIN:
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- case -ERESTARTNOHAND:
- return true;
- default:
- return false;
- }
-}
-
-static void create_worker_cont(struct callback_head *cb)
-{
- struct io_worker *worker;
- struct task_struct *tsk;
- struct io_wqe *wqe;
-
- worker = container_of(cb, struct io_worker, create_work);
- clear_bit_unlock(0, &worker->create_state);
- wqe = worker->wqe;
- tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
- if (!IS_ERR(tsk)) {
- io_init_new_worker(wqe, worker, tsk);
- io_worker_release(worker);
- return;
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&wqe->lock);
- acct->nr_workers--;
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_all,
- .cancel_all = true,
- };
-
- raw_spin_unlock(&wqe->lock);
- while (io_acct_cancel_pending_work(wqe, acct, &match))
- ;
- } else {
- raw_spin_unlock(&wqe->lock);
- }
- io_worker_ref_put(wqe->wq);
- kfree(worker);
- return;
- }
-
- /* re-create attempts grab a new worker ref, drop the existing one */
- io_worker_release(worker);
- schedule_work(&worker->work);
-}
-
-static void io_workqueue_create(struct work_struct *work)
-{
- struct io_worker *worker = container_of(work, struct io_worker, work);
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-
- if (!io_queue_worker_create(worker, acct, create_worker_cont))
- kfree(worker);
-}
-
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
-{
- struct io_wqe_acct *acct = &wqe->acct[index];
- struct io_worker *worker;
- struct task_struct *tsk;
-
- __set_current_state(TASK_RUNNING);
-
- worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
- if (!worker) {
-fail:
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&wqe->lock);
- io_worker_ref_put(wq);
- return false;
- }
-
- refcount_set(&worker->ref, 1);
- worker->wqe = wqe;
- raw_spin_lock_init(&worker->lock);
- init_completion(&worker->ref_done);
-
- if (index == IO_WQ_ACCT_BOUND)
- worker->flags |= IO_WORKER_F_BOUND;
-
- tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
- if (!IS_ERR(tsk)) {
- io_init_new_worker(wqe, worker, tsk);
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
- kfree(worker);
- goto fail;
- } else {
- INIT_WORK(&worker->work, io_workqueue_create);
- schedule_work(&worker->work);
- }
-
- return true;
-}
-
-/*
- * Iterate the passed in list and call the specific function for each
- * worker that isn't exiting
- */
-static bool io_wq_for_each_worker(struct io_wqe *wqe,
- bool (*func)(struct io_worker *, void *),
- void *data)
-{
- struct io_worker *worker;
- bool ret = false;
-
- list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
- if (io_worker_get(worker)) {
- /* no task if node is/was offline */
- if (worker->task)
- ret = func(worker, data);
- io_worker_release(worker);
- if (ret)
- break;
- }
- }
-
- return ret;
-}
-
-static bool io_wq_worker_wake(struct io_worker *worker, void *data)
-{
- __set_notify_signal(worker->task);
- wake_up_process(worker->task);
- return false;
-}
-
-static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
-{
- struct io_wq *wq = wqe->wq;
-
- do {
- work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(work);
- work = wq->free_work(work);
- } while (work);
-}
-
-static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
-{
- struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- unsigned int hash;
- struct io_wq_work *tail;
-
- if (!io_wq_is_hashed(work)) {
-append:
- wq_list_add_tail(&work->list, &acct->work_list);
- return;
- }
-
- hash = io_get_work_hash(work);
- tail = wqe->hash_tail[hash];
- wqe->hash_tail[hash] = work;
- if (!tail)
- goto append;
-
- wq_list_add_after(&work->list, &tail->list, &acct->work_list);
-}
-
-static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
-{
- struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- struct io_cb_cancel_data match;
- unsigned work_flags = work->flags;
- bool do_create;
-
- /*
- * If io-wq is exiting for this task, or if the request has explicitly
- * been marked as one that should not get executed, cancel it here.
- */
- if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
- (work->flags & IO_WQ_WORK_CANCEL)) {
- io_run_cancel(work, wqe);
- return;
- }
-
- raw_spin_lock(&acct->lock);
- io_wqe_insert_work(wqe, work);
- clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&acct->lock);
-
- raw_spin_lock(&wqe->lock);
- rcu_read_lock();
- do_create = !io_wqe_activate_free_worker(wqe, acct);
- rcu_read_unlock();
-
- raw_spin_unlock(&wqe->lock);
-
- if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))) {
- bool did_create;
-
- did_create = io_wqe_create_worker(wqe, acct);
- if (likely(did_create))
- return;
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers) {
- raw_spin_unlock(&wqe->lock);
- return;
- }
- raw_spin_unlock(&wqe->lock);
-
- /* fatal condition, failed to create the first worker */
- match.fn = io_wq_work_match_item,
- match.data = work,
- match.cancel_all = false,
-
- io_acct_cancel_pending_work(wqe, acct, &match);
- }
-}
-
-void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
-{
- struct io_wqe *wqe = wq->wqes[numa_node_id()];
-
- io_wqe_enqueue(wqe, work);
-}
-
-/*
- * Work items that hash to the same value will not be done in parallel.
- * Used to limit concurrent writes, generally hashed by inode.
- */
-void io_wq_hash_work(struct io_wq_work *work, void *val)
-{
- unsigned int bit;
-
- bit = hash_ptr(val, IO_WQ_HASH_ORDER);
- work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
-}
-
-static bool __io_wq_worker_cancel(struct io_worker *worker,
- struct io_cb_cancel_data *match,
- struct io_wq_work *work)
-{
- if (work && match->fn(work, match->data)) {
- work->flags |= IO_WQ_WORK_CANCEL;
- __set_notify_signal(worker->task);
- return true;
- }
-
- return false;
-}
-
-static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
-{
- struct io_cb_cancel_data *match = data;
-
- /*
- * Hold the lock to avoid ->cur_work going out of scope, caller
- * may dereference the passed in work.
- */
- raw_spin_lock(&worker->lock);
- if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
- __io_wq_worker_cancel(worker, match, worker->next_work))
- match->nr_running++;
- raw_spin_unlock(&worker->lock);
-
- return match->nr_running && !match->cancel_all;
-}
-
-static inline void io_wqe_remove_pending(struct io_wqe *wqe,
- struct io_wq_work *work,
- struct io_wq_work_node *prev)
-{
- struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- unsigned int hash = io_get_work_hash(work);
- struct io_wq_work *prev_work = NULL;
-
- if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
- if (prev)
- prev_work = container_of(prev, struct io_wq_work, list);
- if (prev_work && io_get_work_hash(prev_work) == hash)
- wqe->hash_tail[hash] = prev_work;
- else
- wqe->hash_tail[hash] = NULL;
- }
- wq_list_del(&acct->work_list, &work->list, prev);
-}
-
-static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
- struct io_wqe_acct *acct,
- struct io_cb_cancel_data *match)
-{
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
-
- raw_spin_lock(&acct->lock);
- wq_list_for_each(node, prev, &acct->work_list) {
- work = container_of(node, struct io_wq_work, list);
- if (!match->fn(work, match->data))
- continue;
- io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&acct->lock);
- io_run_cancel(work, wqe);
- match->nr_pending++;
- /* not safe to continue after unlock */
- return true;
- }
- raw_spin_unlock(&acct->lock);
-
- return false;
-}
-
-static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
-{
- int i;
-retry:
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
-
- if (io_acct_cancel_pending_work(wqe, acct, match)) {
- if (match->cancel_all)
- goto retry;
- break;
- }
- }
-}
-
-static void io_wqe_cancel_running_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
-{
- rcu_read_lock();
- io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
- rcu_read_unlock();
-}
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data, bool cancel_all)
-{
- struct io_cb_cancel_data match = {
- .fn = cancel,
- .data = data,
- .cancel_all = cancel_all,
- };
- int node;
-
- /*
- * First check pending list, if we're lucky we can just remove it
- * from there. CANCEL_OK means that the work is returned as-new,
- * no completion will be posted for it.
- *
- * Then check if a free (going busy) or busy worker has the work
- * currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempt to signal cancellation. The
- * completion will run normally in this case.
- *
- * Do both of these while holding the wqe->lock, to ensure that
- * we'll find a work item regardless of state.
- */
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- io_wqe_cancel_pending_work(wqe, &match);
- if (match.nr_pending && !match.cancel_all)
- return IO_WQ_CANCEL_OK;
-
- raw_spin_lock(&wqe->lock);
- io_wqe_cancel_running_work(wqe, &match);
- raw_spin_unlock(&wqe->lock);
- if (match.nr_running && !match.cancel_all)
- return IO_WQ_CANCEL_RUNNING;
- }
-
- if (match.nr_running)
- return IO_WQ_CANCEL_RUNNING;
- if (match.nr_pending)
- return IO_WQ_CANCEL_OK;
- return IO_WQ_CANCEL_NOTFOUND;
-}
-
-static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *key)
-{
- struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
- int i;
-
- list_del_init(&wait->entry);
-
- rcu_read_lock();
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- struct io_wqe_acct *acct = &wqe->acct[i];
-
- if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- io_wqe_activate_free_worker(wqe, acct);
- }
- rcu_read_unlock();
- return 1;
-}
-
-struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
-{
- int ret, node, i;
- struct io_wq *wq;
-
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
- return ERR_PTR(-EINVAL);
- if (WARN_ON_ONCE(!bounded))
- return ERR_PTR(-EINVAL);
-
- wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
- if (!wq)
- return ERR_PTR(-ENOMEM);
- ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- if (ret)
- goto err_wq;
-
- refcount_inc(&data->hash->refs);
- wq->hash = data->hash;
- wq->free_work = data->free_work;
- wq->do_work = data->do_work;
-
- ret = -ENOMEM;
- for_each_node(node) {
- struct io_wqe *wqe;
- int alloc_node = node;
-
- if (!node_online(alloc_node))
- alloc_node = NUMA_NO_NODE;
- wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
- if (!wqe)
- goto err;
- if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
- goto err;
- cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
- wq->wqes[node] = wqe;
- wqe->node = alloc_node;
- wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
- wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
- task_rlimit(current, RLIMIT_NPROC);
- INIT_LIST_HEAD(&wqe->wait.entry);
- wqe->wait.func = io_wqe_hash_wake;
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- struct io_wqe_acct *acct = &wqe->acct[i];
-
- acct->index = i;
- atomic_set(&acct->nr_running, 0);
- INIT_WQ_LIST(&acct->work_list);
- raw_spin_lock_init(&acct->lock);
- }
- wqe->wq = wq;
- raw_spin_lock_init(&wqe->lock);
- INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
- INIT_LIST_HEAD(&wqe->all_list);
- }
-
- wq->task = get_task_struct(data->task);
- atomic_set(&wq->worker_refs, 1);
- init_completion(&wq->worker_done);
- return wq;
-err:
- io_wq_put_hash(data->hash);
- cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- for_each_node(node) {
- if (!wq->wqes[node])
- continue;
- free_cpumask_var(wq->wqes[node]->cpu_mask);
- kfree(wq->wqes[node]);
- }
-err_wq:
- kfree(wq);
- return ERR_PTR(ret);
-}
-
-static bool io_task_work_match(struct callback_head *cb, void *data)
-{
- struct io_worker *worker;
-
- if (cb->func != create_worker_cb && cb->func != create_worker_cont)
- return false;
- worker = container_of(cb, struct io_worker, create_work);
- return worker->wqe->wq == data;
-}
-
-void io_wq_exit_start(struct io_wq *wq)
-{
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
-}
-
-static void io_wq_cancel_tw_create(struct io_wq *wq)
-{
- struct callback_head *cb;
-
- while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct io_worker *worker;
-
- worker = container_of(cb, struct io_worker, create_work);
- io_worker_cancel_cb(worker);
- }
-}
-
-static void io_wq_exit_workers(struct io_wq *wq)
-{
- int node;
-
- if (!wq->task)
- return;
-
- io_wq_cancel_tw_create(wq);
-
- rcu_read_lock();
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
- }
- rcu_read_unlock();
- io_worker_ref_put(wq);
- wait_for_completion(&wq->worker_done);
-
- for_each_node(node) {
- spin_lock_irq(&wq->hash->wait.lock);
- list_del_init(&wq->wqes[node]->wait.entry);
- spin_unlock_irq(&wq->hash->wait.lock);
- }
- put_task_struct(wq->task);
- wq->task = NULL;
-}
-
-static void io_wq_destroy(struct io_wq *wq)
-{
- int node;
-
- cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_all,
- .cancel_all = true,
- };
- io_wqe_cancel_pending_work(wqe, &match);
- free_cpumask_var(wqe->cpu_mask);
- kfree(wqe);
- }
- io_wq_put_hash(wq->hash);
- kfree(wq);
-}
-
-void io_wq_put_and_exit(struct io_wq *wq)
-{
- WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
-
- io_wq_exit_workers(wq);
- io_wq_destroy(wq);
-}
-
-struct online_data {
- unsigned int cpu;
- bool online;
-};
-
-static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
-{
- struct online_data *od = data;
-
- if (od->online)
- cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
- else
- cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
- return false;
-}
-
-static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
-{
- struct online_data od = {
- .cpu = cpu,
- .online = online
- };
- int i;
-
- rcu_read_lock();
- for_each_node(i)
- io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
- rcu_read_unlock();
- return 0;
-}
-
-static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
-
- return __io_wq_cpu_online(wq, cpu, true);
-}
-
-static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
-
- return __io_wq_cpu_online(wq, cpu, false);
-}
-
-int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
-{
- int i;
-
- rcu_read_lock();
- for_each_node(i) {
- struct io_wqe *wqe = wq->wqes[i];
-
- if (mask)
- cpumask_copy(wqe->cpu_mask, mask);
- else
- cpumask_copy(wqe->cpu_mask, cpumask_of_node(i));
- }
- rcu_read_unlock();
- return 0;
-}
-
-/*
- * Set max number of unbounded workers, returns old value. If new_count is 0,
- * then just return the old value.
- */
-int io_wq_max_workers(struct io_wq *wq, int *new_count)
-{
- int prev[IO_WQ_ACCT_NR];
- bool first_node = true;
- int i, node;
-
- BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
- BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
- BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
-
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
- new_count[i] = task_rlimit(current, RLIMIT_NPROC);
- }
-
- for (i = 0; i < IO_WQ_ACCT_NR; i++)
- prev[i] = 0;
-
- rcu_read_lock();
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- struct io_wqe_acct *acct;
-
- raw_spin_lock(&wqe->lock);
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- acct = &wqe->acct[i];
- if (first_node)
- prev[i] = max_t(int, acct->max_workers, prev[i]);
- if (new_count[i])
- acct->max_workers = new_count[i];
- }
- raw_spin_unlock(&wqe->lock);
- first_node = false;
- }
- rcu_read_unlock();
-
- for (i = 0; i < IO_WQ_ACCT_NR; i++)
- new_count[i] = prev[i];
-
- return 0;
-}
-
-static __init int io_wq_init(void)
-{
- int ret;
-
- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
- io_wq_cpu_online, io_wq_cpu_offline);
- if (ret < 0)
- return ret;
- io_wq_online = ret;
- return 0;
-}
-subsys_initcall(io_wq_init);
diff --git a/fs/io-wq.h b/fs/io-wq.h
deleted file mode 100644
index ba6eee76d028..000000000000
--- a/fs/io-wq.h
+++ /dev/null
@@ -1,228 +0,0 @@
-#ifndef INTERNAL_IO_WQ_H
-#define INTERNAL_IO_WQ_H
-
-#include <linux/refcount.h>
-
-struct io_wq;
-
-enum {
- IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HASHED = 2,
- IO_WQ_WORK_UNBOUND = 4,
- IO_WQ_WORK_CONCURRENT = 16,
-
- IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
-};
-
-enum io_wq_cancel {
- IO_WQ_CANCEL_OK, /* cancelled before started */
- IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
- IO_WQ_CANCEL_NOTFOUND, /* work not found */
-};
-
-struct io_wq_work_node {
- struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
- struct io_wq_work_node *first;
- struct io_wq_work_node *last;
-};
-
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
-
-#define wq_list_for_each_resume(pos, prv) \
- for (; pos; prv = pos, pos = (pos)->next)
-
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
-} while (0)
-
-static inline void wq_list_add_after(struct io_wq_work_node *node,
- struct io_wq_work_node *pos,
- struct io_wq_work_list *list)
-{
- struct io_wq_work_node *next = pos->next;
-
- pos->next = node;
- node->next = next;
- if (!next)
- list->last = node;
-}
-
-/**
- * wq_list_merge - merge the second list to the first one.
- * @list0: the first list
- * @list1: the second list
- * Return the first node after mergence.
- */
-static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
- struct io_wq_work_list *list1)
-{
- struct io_wq_work_node *ret;
-
- if (!list0->first) {
- ret = list1->first;
- } else {
- ret = list0->first;
- list0->last->next = list1->first;
- }
- INIT_WQ_LIST(list0);
- INIT_WQ_LIST(list1);
- return ret;
-}
-
-static inline void wq_list_add_tail(struct io_wq_work_node *node,
- struct io_wq_work_list *list)
-{
- node->next = NULL;
- if (!list->first) {
- list->last = node;
- WRITE_ONCE(list->first, node);
- } else {
- list->last->next = node;
- list->last = node;
- }
-}
-
-static inline void wq_list_add_head(struct io_wq_work_node *node,
- struct io_wq_work_list *list)
-{
- node->next = list->first;
- if (!node->next)
- list->last = node;
- WRITE_ONCE(list->first, node);
-}
-
-static inline void wq_list_cut(struct io_wq_work_list *list,
- struct io_wq_work_node *last,
- struct io_wq_work_node *prev)
-{
- /* first in the list, if prev==NULL */
- if (!prev)
- WRITE_ONCE(list->first, last->next);
- else
- prev->next = last->next;
-
- if (last == list->last)
- list->last = prev;
- last->next = NULL;
-}
-
-static inline void __wq_list_splice(struct io_wq_work_list *list,
- struct io_wq_work_node *to)
-{
- list->last->next = to->next;
- to->next = list->first;
- INIT_WQ_LIST(list);
-}
-
-static inline bool wq_list_splice(struct io_wq_work_list *list,
- struct io_wq_work_node *to)
-{
- if (!wq_list_empty(list)) {
- __wq_list_splice(list, to);
- return true;
- }
- return false;
-}
-
-static inline void wq_stack_add_head(struct io_wq_work_node *node,
- struct io_wq_work_node *stack)
-{
- node->next = stack->next;
- stack->next = node;
-}
-
-static inline void wq_list_del(struct io_wq_work_list *list,
- struct io_wq_work_node *node,
- struct io_wq_work_node *prev)
-{
- wq_list_cut(list, node, prev);
-}
-
-static inline
-struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
-{
- struct io_wq_work_node *node = stack->next;
-
- stack->next = node->next;
- return node;
-}
-
-struct io_wq_work {
- struct io_wq_work_node list;
- unsigned flags;
- int cancel_seq;
-};
-
-static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
-{
- if (!work->list.next)
- return NULL;
-
- return container_of(work->list.next, struct io_wq_work, list);
-}
-
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work *);
-
-struct io_wq_hash {
- refcount_t refs;
- unsigned long map;
- struct wait_queue_head wait;
-};
-
-static inline void io_wq_put_hash(struct io_wq_hash *hash)
-{
- if (refcount_dec_and_test(&hash->refs))
- kfree(hash);
-}
-
-struct io_wq_data {
- struct io_wq_hash *hash;
- struct task_struct *task;
- io_wq_work_fn *do_work;
- free_work_fn *free_work;
-};
-
-struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
-void io_wq_exit_start(struct io_wq *wq);
-void io_wq_put_and_exit(struct io_wq *wq);
-
-void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
-void io_wq_hash_work(struct io_wq_work *work, void *val);
-
-int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
-int io_wq_max_workers(struct io_wq *wq, int *new_count);
-
-static inline bool io_wq_is_hashed(struct io_wq_work *work)
-{
- return work->flags & IO_WQ_WORK_HASHED;
-}
-
-typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data, bool cancel_all);
-
-#if defined(CONFIG_IO_WQ)
-extern void io_wq_worker_sleeping(struct task_struct *);
-extern void io_wq_worker_running(struct task_struct *);
-#else
-static inline void io_wq_worker_sleeping(struct task_struct *tsk)
-{
-}
-static inline void io_wq_worker_running(struct task_struct *tsk)
-{
-}
-#endif
-
-static inline bool io_wq_current_is_worker(void)
-{
- return in_task() && (current->flags & PF_IO_WORKER) &&
- current->worker_private;
-}
-#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
deleted file mode 100644
index 5ff2cdb425bc..000000000000
--- a/fs/io_uring.c
+++ /dev/null
@@ -1,13257 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Shared application/kernel submission and completion ring pairs, for
- * supporting fast/efficient IO.
- *
- * A note on the read/write ordering memory barriers that are matched between
- * the application and kernel side.
- *
- * After the application reads the CQ ring tail, it must use an
- * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
- * before writing the tail (using smp_load_acquire to read the tail will
- * do). It also needs a smp_mb() before updating CQ head (ordering the
- * entry load(s) with the head store), pairing with an implicit barrier
- * through a control-dependency in io_get_cqe (smp_store_release to
- * store head will do). Failure to do so could lead to reading invalid
- * CQ entries.
- *
- * Likewise, the application must use an appropriate smp_wmb() before
- * writing the SQ tail (ordering SQ entry stores with the tail store),
- * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
- * to store the tail will do). And it needs a barrier ordering the SQ
- * head load before writing new SQ entries (smp_load_acquire to read
- * head will do).
- *
- * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
- * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
- * updating the SQ tail; a full memory barrier smp_mb() is needed
- * between.
- *
- * Also see the examples in the liburing library:
- *
- * git://git.kernel.dk/liburing
- *
- * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
- * from data shared between the kernel and application. This is done both
- * for ordering purposes, but also to ensure that once a value is loaded from
- * data that the application could potentially modify, it remains stable.
- *
- * Copyright (C) 2018-2019 Jens Axboe
- * Copyright (c) 2018-2019 Christoph Hellwig
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/syscalls.h>
-#include <linux/compat.h>
-#include <net/compat.h>
-#include <linux/refcount.h>
-#include <linux/uio.h>
-#include <linux/bits.h>
-
-#include <linux/sched/signal.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/bvec.h>
-#include <linux/net.h>
-#include <net/sock.h>
-#include <net/af_unix.h>
-#include <net/scm.h>
-#include <linux/anon_inodes.h>
-#include <linux/sched/mm.h>
-#include <linux/uaccess.h>
-#include <linux/nospec.h>
-#include <linux/sizes.h>
-#include <linux/hugetlb.h>
-#include <linux/highmem.h>
-#include <linux/namei.h>
-#include <linux/fsnotify.h>
-#include <linux/fadvise.h>
-#include <linux/eventpoll.h>
-#include <linux/splice.h>
-#include <linux/task_work.h>
-#include <linux/pagemap.h>
-#include <linux/io_uring.h>
-#include <linux/audit.h>
-#include <linux/security.h>
-#include <linux/xattr.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/io_uring.h>
-
-#include <uapi/linux/io_uring.h>
-
-#include "internal.h"
-#include "io-wq.h"
-
-#define IORING_MAX_ENTRIES 32768
-#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
-
-/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 20)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
-#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
-#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
-#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
-
-#define IORING_MAX_REG_BUFFERS (1U << 14)
-
-#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC)
-
-#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
- IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
-
-#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
- REQ_F_ASYNC_DATA)
-
-#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
- IO_REQ_CLEAN_FLAGS)
-
-#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
-
-#define IO_TCTX_REFS_CACHE_NR (1U << 10)
-
-struct io_uring {
- u32 head ____cacheline_aligned_in_smp;
- u32 tail ____cacheline_aligned_in_smp;
-};
-
-/*
- * This data is shared with the application through the mmap at offsets
- * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_sqring_offsets when calling io_uring_setup.
- */
-struct io_rings {
- /*
- * Head and tail offsets into the ring; the offsets need to be
- * masked to get valid indices.
- *
- * The kernel controls head of the sq ring and the tail of the cq ring,
- * and the application controls tail of the sq ring and the head of the
- * cq ring.
- */
- struct io_uring sq, cq;
- /*
- * Bitmasks to apply to head and tail offsets (constant, equals
- * ring_entries - 1)
- */
- u32 sq_ring_mask, cq_ring_mask;
- /* Ring sizes (constant, power of 2) */
- u32 sq_ring_entries, cq_ring_entries;
- /*
- * Number of invalid entries dropped by the kernel due to
- * invalid index stored in array
- *
- * Written by the kernel, shouldn't be modified by the
- * application (i.e. get number of "new events" by comparing to
- * cached value).
- *
- * After a new SQ head value was read by the application this
- * counter includes all submissions that were dropped reaching
- * the new SQ head (and possibly more).
- */
- u32 sq_dropped;
- /*
- * Runtime SQ flags
- *
- * Written by the kernel, shouldn't be modified by the
- * application.
- *
- * The application needs a full memory barrier before checking
- * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
- */
- atomic_t sq_flags;
- /*
- * Runtime CQ flags
- *
- * Written by the application, shouldn't be modified by the
- * kernel.
- */
- u32 cq_flags;
- /*
- * Number of completion events lost because the queue was full;
- * this should be avoided by the application by making sure
- * there are not more requests pending than there is space in
- * the completion queue.
- *
- * Written by the kernel, shouldn't be modified by the
- * application (i.e. get number of "new events" by comparing to
- * cached value).
- *
- * As completion events come in out of order this counter is not
- * ordered with any other data.
- */
- u32 cq_overflow;
- /*
- * Ring buffer of completion events.
- *
- * The kernel writes completion events fresh every time they are
- * produced, so the application is allowed to modify pending
- * entries.
- */
- struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
-};
-
-struct io_mapped_ubuf {
- u64 ubuf;
- u64 ubuf_end;
- unsigned int nr_bvecs;
- unsigned long acct_pages;
- struct bio_vec bvec[];
-};
-
-struct io_ring_ctx;
-
-struct io_overflow_cqe {
- struct list_head list;
- struct io_uring_cqe cqe;
-};
-
-/*
- * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
- * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
- * can't safely always dereference the file when the task has exited and ring
- * cleanup is done. If a file is tracked and part of SCM, then unix gc on
- * process exit may reap it before __io_sqe_files_unregister() is run.
- */
-#define FFS_NOWAIT 0x1UL
-#define FFS_ISREG 0x2UL
-#if defined(CONFIG_64BIT)
-#define FFS_SCM 0x4UL
-#else
-#define IO_URING_SCM_ALL
-#define FFS_SCM 0x0UL
-#endif
-#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
-
-struct io_fixed_file {
- /* file * with additional FFS_* flags */
- unsigned long file_ptr;
-};
-
-struct io_rsrc_put {
- struct list_head list;
- u64 tag;
- union {
- void *rsrc;
- struct file *file;
- struct io_mapped_ubuf *buf;
- };
-};
-
-struct io_file_table {
- struct io_fixed_file *files;
- unsigned long *bitmap;
- unsigned int alloc_hint;
-};
-
-struct io_rsrc_node {
- struct percpu_ref refs;
- struct list_head node;
- struct list_head rsrc_list;
- struct io_rsrc_data *rsrc_data;
- struct llist_node llist;
- bool done;
-};
-
-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
-struct io_rsrc_data {
- struct io_ring_ctx *ctx;
-
- u64 **tags;
- unsigned int nr;
- rsrc_put_fn *do_put;
- atomic_t refs;
- struct completion done;
- bool quiesce;
-};
-
-#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
-struct io_buffer_list {
- /*
- * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
- * then these are classic provided buffers and ->buf_list is used.
- */
- union {
- struct list_head buf_list;
- struct {
- struct page **buf_pages;
- struct io_uring_buf_ring *buf_ring;
- };
- };
- __u16 bgid;
-
- /* below is for ring provided buffers */
- __u16 buf_nr_pages;
- __u16 nr_entries;
- __u16 head;
- __u16 mask;
-};
-
-struct io_buffer {
- struct list_head list;
- __u64 addr;
- __u32 len;
- __u16 bid;
- __u16 bgid;
-};
-
-struct io_restriction {
- DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
- DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
- u8 sqe_flags_allowed;
- u8 sqe_flags_required;
- bool registered;
-};
-
-enum {
- IO_SQ_THREAD_SHOULD_STOP = 0,
- IO_SQ_THREAD_SHOULD_PARK,
-};
-
-struct io_sq_data {
- refcount_t refs;
- atomic_t park_pending;
- struct mutex lock;
-
- /* ctx's that are using this sqd */
- struct list_head ctx_list;
-
- struct task_struct *thread;
- struct wait_queue_head wait;
-
- unsigned sq_thread_idle;
- int sq_cpu;
- pid_t task_pid;
- pid_t task_tgid;
-
- unsigned long state;
- struct completion exited;
-};
-
-#define IO_COMPL_BATCH 32
-#define IO_REQ_CACHE_SIZE 32
-#define IO_REQ_ALLOC_BATCH 8
-
-struct io_submit_link {
- struct io_kiocb *head;
- struct io_kiocb *last;
-};
-
-struct io_submit_state {
- /* inline/task_work completion list, under ->uring_lock */
- struct io_wq_work_node free_list;
- /* batch completion logic */
- struct io_wq_work_list compl_reqs;
- struct io_submit_link link;
-
- bool plug_started;
- bool need_plug;
- bool flush_cqes;
- unsigned short submit_nr;
- struct blk_plug plug;
-};
-
-struct io_ev_fd {
- struct eventfd_ctx *cq_ev_fd;
- unsigned int eventfd_async: 1;
- struct rcu_head rcu;
-};
-
-#define BGID_ARRAY 64
-
-struct io_ring_ctx {
- /* const or read-mostly hot data */
- struct {
- struct percpu_ref refs;
-
- struct io_rings *rings;
- unsigned int flags;
- enum task_work_notify_mode notify_method;
- unsigned int compat: 1;
- unsigned int drain_next: 1;
- unsigned int restricted: 1;
- unsigned int off_timeout_used: 1;
- unsigned int drain_active: 1;
- unsigned int drain_disabled: 1;
- unsigned int has_evfd: 1;
- unsigned int syscall_iopoll: 1;
- } ____cacheline_aligned_in_smp;
-
- /* submission data */
- struct {
- struct mutex uring_lock;
-
- /*
- * Ring buffer of indices into array of io_uring_sqe, which is
- * mmapped by the application using the IORING_OFF_SQES offset.
- *
- * This indirection could e.g. be used to assign fixed
- * io_uring_sqe entries to operations and only submit them to
- * the queue when needed.
- *
- * The kernel modifies neither the indices array nor the entries
- * array.
- */
- u32 *sq_array;
- struct io_uring_sqe *sq_sqes;
- unsigned cached_sq_head;
- unsigned sq_entries;
- struct list_head defer_list;
-
- /*
- * Fixed resources fast path, should be accessed only under
- * uring_lock, and updated through io_uring_register(2)
- */
- struct io_rsrc_node *rsrc_node;
- int rsrc_cached_refs;
- atomic_t cancel_seq;
- struct io_file_table file_table;
- unsigned nr_user_files;
- unsigned nr_user_bufs;
- struct io_mapped_ubuf **user_bufs;
-
- struct io_submit_state submit_state;
-
- struct io_buffer_list *io_bl;
- struct xarray io_bl_xa;
- struct list_head io_buffers_cache;
-
- struct list_head timeout_list;
- struct list_head ltimeout_list;
- struct list_head cq_overflow_list;
- struct list_head apoll_cache;
- struct xarray personalities;
- u32 pers_next;
- unsigned sq_thread_idle;
- } ____cacheline_aligned_in_smp;
-
- /* IRQ completion list, under ->completion_lock */
- struct io_wq_work_list locked_free_list;
- unsigned int locked_free_nr;
-
- const struct cred *sq_creds; /* cred used for __io_sq_thread() */
- struct io_sq_data *sq_data; /* if using sq thread polling */
-
- struct wait_queue_head sqo_sq_wait;
- struct list_head sqd_list;
-
- unsigned long check_cq;
-
- struct {
- /*
- * We cache a range of free CQEs we can use, once exhausted it
- * should go through a slower range setup, see __io_get_cqe()
- */
- struct io_uring_cqe *cqe_cached;
- struct io_uring_cqe *cqe_sentinel;
-
- unsigned cached_cq_tail;
- unsigned cq_entries;
- struct io_ev_fd __rcu *io_ev_fd;
- struct wait_queue_head cq_wait;
- unsigned cq_extra;
- atomic_t cq_timeouts;
- unsigned cq_last_tm_flush;
- } ____cacheline_aligned_in_smp;
-
- struct {
- spinlock_t completion_lock;
-
- spinlock_t timeout_lock;
-
- /*
- * ->iopoll_list is protected by the ctx->uring_lock for
- * io_uring instances that don't use IORING_SETUP_SQPOLL.
- * For SQPOLL, only the single threaded io_sq_thread() will
- * manipulate the list, hence no extra locking is needed there.
- */
- struct io_wq_work_list iopoll_list;
- struct hlist_head *cancel_hash;
- unsigned cancel_hash_bits;
- bool poll_multi_queue;
-
- struct list_head io_buffers_comp;
- } ____cacheline_aligned_in_smp;
-
- struct io_restriction restrictions;
-
- /* slow path rsrc auxilary data, used by update/register */
- struct {
- struct io_rsrc_node *rsrc_backup_node;
- struct io_mapped_ubuf *dummy_ubuf;
- struct io_rsrc_data *file_data;
- struct io_rsrc_data *buf_data;
-
- struct delayed_work rsrc_put_work;
- struct llist_head rsrc_put_llist;
- struct list_head rsrc_ref_list;
- spinlock_t rsrc_ref_lock;
-
- struct list_head io_buffers_pages;
- };
-
- /* Keep this last, we don't need it for the fast path */
- struct {
- #if defined(CONFIG_UNIX)
- struct socket *ring_sock;
- #endif
- /* hashed buffered write serialization */
- struct io_wq_hash *hash_map;
-
- /* Only used for accounting purposes */
- struct user_struct *user;
- struct mm_struct *mm_account;
-
- /* ctx exit and cancelation */
- struct llist_head fallback_llist;
- struct delayed_work fallback_work;
- struct work_struct exit_work;
- struct list_head tctx_list;
- struct completion ref_comp;
- u32 iowq_limits[2];
- bool iowq_limits_set;
- };
-};
-
-/*
- * Arbitrary limit, can be raised if need be
- */
-#define IO_RINGFD_REG_MAX 16
-
-struct io_uring_task {
- /* submission side */
- int cached_refs;
- struct xarray xa;
- struct wait_queue_head wait;
- const struct io_ring_ctx *last;
- struct io_wq *io_wq;
- struct percpu_counter inflight;
- atomic_t inflight_tracked;
- atomic_t in_idle;
-
- spinlock_t task_lock;
- struct io_wq_work_list task_list;
- struct io_wq_work_list prio_task_list;
- struct callback_head task_work;
- struct file **registered_rings;
- bool task_running;
-};
-
-/*
- * First field must be the file pointer in all the
- * iocb unions! See also 'struct kiocb' in <linux/fs.h>
- */
-struct io_poll_iocb {
- struct file *file;
- struct wait_queue_head *head;
- __poll_t events;
- struct wait_queue_entry wait;
-};
-
-struct io_poll_update {
- struct file *file;
- u64 old_user_data;
- u64 new_user_data;
- __poll_t events;
- bool update_events;
- bool update_user_data;
-};
-
-struct io_close {
- struct file *file;
- int fd;
- u32 file_slot;
-};
-
-struct io_timeout_data {
- struct io_kiocb *req;
- struct hrtimer timer;
- struct timespec64 ts;
- enum hrtimer_mode mode;
- u32 flags;
-};
-
-struct io_accept {
- struct file *file;
- struct sockaddr __user *addr;
- int __user *addr_len;
- int flags;
- u32 file_slot;
- unsigned long nofile;
-};
-
-struct io_socket {
- struct file *file;
- int domain;
- int type;
- int protocol;
- int flags;
- u32 file_slot;
- unsigned long nofile;
-};
-
-struct io_sync {
- struct file *file;
- loff_t len;
- loff_t off;
- int flags;
- int mode;
-};
-
-struct io_cancel {
- struct file *file;
- u64 addr;
- u32 flags;
- s32 fd;
-};
-
-struct io_timeout {
- struct file *file;
- u32 off;
- u32 target_seq;
- struct list_head list;
- /* head of the link, used by linked timeouts only */
- struct io_kiocb *head;
- /* for linked completions */
- struct io_kiocb *prev;
-};
-
-struct io_timeout_rem {
- struct file *file;
- u64 addr;
-
- /* timeout update */
- struct timespec64 ts;
- u32 flags;
- bool ltimeout;
-};
-
-struct io_rw {
- /* NOTE: kiocb has the file as the first member, so don't do it here */
- struct kiocb kiocb;
- u64 addr;
- u32 len;
- rwf_t flags;
-};
-
-struct io_connect {
- struct file *file;
- struct sockaddr __user *addr;
- int addr_len;
-};
-
-struct io_sr_msg {
- struct file *file;
- union {
- struct compat_msghdr __user *umsg_compat;
- struct user_msghdr __user *umsg;
- void __user *buf;
- };
- int msg_flags;
- size_t len;
- size_t done_io;
- unsigned int flags;
-};
-
-struct io_open {
- struct file *file;
- int dfd;
- u32 file_slot;
- struct filename *filename;
- struct open_how how;
- unsigned long nofile;
-};
-
-struct io_rsrc_update {
- struct file *file;
- u64 arg;
- u32 nr_args;
- u32 offset;
-};
-
-struct io_fadvise {
- struct file *file;
- u64 offset;
- u32 len;
- u32 advice;
-};
-
-struct io_madvise {
- struct file *file;
- u64 addr;
- u32 len;
- u32 advice;
-};
-
-struct io_epoll {
- struct file *file;
- int epfd;
- int op;
- int fd;
- struct epoll_event event;
-};
-
-struct io_splice {
- struct file *file_out;
- loff_t off_out;
- loff_t off_in;
- u64 len;
- int splice_fd_in;
- unsigned int flags;
-};
-
-struct io_provide_buf {
- struct file *file;
- __u64 addr;
- __u32 len;
- __u32 bgid;
- __u16 nbufs;
- __u16 bid;
-};
-
-struct io_statx {
- struct file *file;
- int dfd;
- unsigned int mask;
- unsigned int flags;
- struct filename *filename;
- struct statx __user *buffer;
-};
-
-struct io_shutdown {
- struct file *file;
- int how;
-};
-
-struct io_rename {
- struct file *file;
- int old_dfd;
- int new_dfd;
- struct filename *oldpath;
- struct filename *newpath;
- int flags;
-};
-
-struct io_unlink {
- struct file *file;
- int dfd;
- int flags;
- struct filename *filename;
-};
-
-struct io_mkdir {
- struct file *file;
- int dfd;
- umode_t mode;
- struct filename *filename;
-};
-
-struct io_symlink {
- struct file *file;
- int new_dfd;
- struct filename *oldpath;
- struct filename *newpath;
-};
-
-struct io_hardlink {
- struct file *file;
- int old_dfd;
- int new_dfd;
- struct filename *oldpath;
- struct filename *newpath;
- int flags;
-};
-
-struct io_msg {
- struct file *file;
- u64 user_data;
- u32 len;
-};
-
-struct io_async_connect {
- struct sockaddr_storage address;
-};
-
-struct io_async_msghdr {
- struct iovec fast_iov[UIO_FASTIOV];
- /* points to an allocated iov, if NULL we use fast_iov instead */
- struct iovec *free_iov;
- struct sockaddr __user *uaddr;
- struct msghdr msg;
- struct sockaddr_storage addr;
-};
-
-struct io_rw_state {
- struct iov_iter iter;
- struct iov_iter_state iter_state;
- struct iovec fast_iov[UIO_FASTIOV];
-};
-
-struct io_async_rw {
- struct io_rw_state s;
- const struct iovec *free_iovec;
- size_t bytes_done;
- struct wait_page_queue wpq;
-};
-
-struct io_xattr {
- struct file *file;
- struct xattr_ctx ctx;
- struct filename *filename;
-};
-
-enum {
- REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
- REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
- REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
- REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
- REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
- REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
-
- /* first byte is taken by user flags, shift it to not overlap */
- REQ_F_FAIL_BIT = 8,
- REQ_F_INFLIGHT_BIT,
- REQ_F_CUR_POS_BIT,
- REQ_F_NOWAIT_BIT,
- REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_NEED_CLEANUP_BIT,
- REQ_F_POLLED_BIT,
- REQ_F_BUFFER_SELECTED_BIT,
- REQ_F_BUFFER_RING_BIT,
- REQ_F_COMPLETE_INLINE_BIT,
- REQ_F_REISSUE_BIT,
- REQ_F_CREDS_BIT,
- REQ_F_REFCOUNT_BIT,
- REQ_F_ARM_LTIMEOUT_BIT,
- REQ_F_ASYNC_DATA_BIT,
- REQ_F_SKIP_LINK_CQES_BIT,
- REQ_F_SINGLE_POLL_BIT,
- REQ_F_DOUBLE_POLL_BIT,
- REQ_F_PARTIAL_IO_BIT,
- REQ_F_CQE32_INIT_BIT,
- REQ_F_APOLL_MULTISHOT_BIT,
- /* keep async read/write and isreg together and in order */
- REQ_F_SUPPORT_NOWAIT_BIT,
- REQ_F_ISREG_BIT,
-
- /* not a real bit, just to check we're not overflowing the space */
- __REQ_F_LAST_BIT,
-};
-
-enum {
- /* ctx owns file */
- REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
- /* drain existing IO first */
- REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
- /* linked sqes */
- REQ_F_LINK = BIT(REQ_F_LINK_BIT),
- /* doesn't sever on completion < 0 */
- REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
- /* IOSQE_ASYNC */
- REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
- /* IOSQE_BUFFER_SELECT */
- REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* IOSQE_CQE_SKIP_SUCCESS */
- REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
-
- /* fail rest of links */
- REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
- /* on inflight list, should be cancelled and waited on exit reliably */
- REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
- /* read/write uses file position */
- REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
- /* must not punt to workers */
- REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* has or had linked timeout */
- REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* needs cleanup */
- REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
- /* already went through poll handler */
- REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
- /* buffer already selected */
- REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
- /* buffer selected from ring, needs commit */
- REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
- /* completion is deferred through io_comp_state */
- REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
- /* caller should reissue async */
- REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads/writes */
- REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
- /* regular file */
- REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* has creds assigned */
- REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
- /* skip refcounting if not set */
- REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
- /* there is a linked timeout that has to be armed */
- REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
- /* ->async_data allocated */
- REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
- /* don't post CQEs while failing linked requests */
- REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
- /* single poll may be active */
- REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
- /* double poll may active */
- REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
- /* request has already done partial IO */
- REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
- /* fast poll multishot mode */
- REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
- /* ->extra1 and ->extra2 are initialised */
- REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
-};
-
-struct async_poll {
- struct io_poll_iocb poll;
- struct io_poll_iocb *double_poll;
-};
-
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
-
-struct io_task_work {
- union {
- struct io_wq_work_node node;
- struct llist_node fallback_node;
- };
- io_req_tw_func_t func;
-};
-
-enum {
- IORING_RSRC_FILE = 0,
- IORING_RSRC_BUFFER = 1,
-};
-
-struct io_cqe {
- __u64 user_data;
- __s32 res;
- /* fd initially, then cflags for completion */
- union {
- __u32 flags;
- int fd;
- };
-};
-
-enum {
- IO_CHECK_CQ_OVERFLOW_BIT,
- IO_CHECK_CQ_DROPPED_BIT,
-};
-
-/*
- * NOTE! Each of the iocb union members has the file pointer
- * as the first entry in their struct definition. So you can
- * access the file pointer through any of the sub-structs,
- * or directly as just 'file' in this struct.
- */
-struct io_kiocb {
- union {
- struct file *file;
- struct io_rw rw;
- struct io_poll_iocb poll;
- struct io_poll_update poll_update;
- struct io_accept accept;
- struct io_sync sync;
- struct io_cancel cancel;
- struct io_timeout timeout;
- struct io_timeout_rem timeout_rem;
- struct io_connect connect;
- struct io_sr_msg sr_msg;
- struct io_open open;
- struct io_close close;
- struct io_rsrc_update rsrc_update;
- struct io_fadvise fadvise;
- struct io_madvise madvise;
- struct io_epoll epoll;
- struct io_splice splice;
- struct io_provide_buf pbuf;
- struct io_statx statx;
- struct io_shutdown shutdown;
- struct io_rename rename;
- struct io_unlink unlink;
- struct io_mkdir mkdir;
- struct io_symlink symlink;
- struct io_hardlink hardlink;
- struct io_msg msg;
- struct io_xattr xattr;
- struct io_socket sock;
- struct io_uring_cmd uring_cmd;
- };
-
- u8 opcode;
- /* polled IO has completed */
- u8 iopoll_completed;
- /*
- * Can be either a fixed buffer index, or used with provided buffers.
- * For the latter, before issue it points to the buffer group ID,
- * and after selection it points to the buffer ID itself.
- */
- u16 buf_index;
- unsigned int flags;
-
- struct io_cqe cqe;
-
- struct io_ring_ctx *ctx;
- struct task_struct *task;
-
- struct io_rsrc_node *rsrc_node;
-
- union {
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
-
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
-
- /*
- * stores buffer ID for ring provided buffers, valid IFF
- * REQ_F_BUFFER_RING is set.
- */
- struct io_buffer_list *buf_list;
- };
-
- union {
- /* used by request caches, completion batching and iopoll */
- struct io_wq_work_node comp_list;
- /* cache ->apoll->events */
- __poll_t apoll_events;
- };
- atomic_t refs;
- atomic_t poll_refs;
- struct io_task_work io_task_work;
- /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
- union {
- struct hlist_node hash_node;
- struct {
- u64 extra1;
- u64 extra2;
- };
- };
- /* internal polling, see IORING_FEAT_FAST_POLL */
- struct async_poll *apoll;
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
- /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
- struct io_kiocb *link;
- /* custom credentials, valid IFF REQ_F_CREDS is set */
- const struct cred *creds;
- struct io_wq_work work;
-};
-
-struct io_tctx_node {
- struct list_head ctx_node;
- struct task_struct *task;
- struct io_ring_ctx *ctx;
-};
-
-struct io_defer_entry {
- struct list_head list;
- struct io_kiocb *req;
- u32 seq;
-};
-
-struct io_cancel_data {
- struct io_ring_ctx *ctx;
- union {
- u64 data;
- struct file *file;
- };
- u32 flags;
- int seq;
-};
-
-/*
- * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
- * the following sqe if SQE128 is used.
- */
-#define uring_cmd_pdu_size(is_sqe128) \
- ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
- offsetof(struct io_uring_sqe, cmd))
-
-struct io_op_def {
- /* needs req->file assigned */
- unsigned needs_file : 1;
- /* should block plug */
- unsigned plug : 1;
- /* hash wq insertion if file is a regular file */
- unsigned hash_reg_file : 1;
- /* unbound wq insertion if file is a non-regular file */
- unsigned unbound_nonreg_file : 1;
- /* set if opcode supports polled "wait" */
- unsigned pollin : 1;
- unsigned pollout : 1;
- unsigned poll_exclusive : 1;
- /* op supports buffer selection */
- unsigned buffer_select : 1;
- /* do prep async if is going to be punted */
- unsigned needs_async_setup : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
- /* skip auditing */
- unsigned audit_skip : 1;
- /* supports ioprio */
- unsigned ioprio : 1;
- /* supports iopoll */
- unsigned iopoll : 1;
- /* size of async data needed, if any */
- unsigned short async_size;
-};
-
-static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {
- .audit_skip = 1,
- .iopoll = 1,
- },
- [IORING_OP_READV] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .needs_async_setup = 1,
- .plug = 1,
- .audit_skip = 1,
- .ioprio = 1,
- .iopoll = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_WRITEV] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .needs_async_setup = 1,
- .plug = 1,
- .audit_skip = 1,
- .ioprio = 1,
- .iopoll = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_FSYNC] = {
- .needs_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_READ_FIXED] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .plug = 1,
- .audit_skip = 1,
- .ioprio = 1,
- .iopoll = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_WRITE_FIXED] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .plug = 1,
- .audit_skip = 1,
- .ioprio = 1,
- .iopoll = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_POLL_ADD] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_POLL_REMOVE] = {
- .audit_skip = 1,
- },
- [IORING_OP_SYNC_FILE_RANGE] = {
- .needs_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_SENDMSG] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .needs_async_setup = 1,
- .async_size = sizeof(struct io_async_msghdr),
- },
- [IORING_OP_RECVMSG] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .needs_async_setup = 1,
- .async_size = sizeof(struct io_async_msghdr),
- },
- [IORING_OP_TIMEOUT] = {
- .audit_skip = 1,
- .async_size = sizeof(struct io_timeout_data),
- },
- [IORING_OP_TIMEOUT_REMOVE] = {
- /* used by timeout updates' prep() */
- .audit_skip = 1,
- },
- [IORING_OP_ACCEPT] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .poll_exclusive = 1,
- .ioprio = 1, /* used for flags */
- },
- [IORING_OP_ASYNC_CANCEL] = {
- .audit_skip = 1,
- },
- [IORING_OP_LINK_TIMEOUT] = {
- .audit_skip = 1,
- .async_size = sizeof(struct io_timeout_data),
- },
- [IORING_OP_CONNECT] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .needs_async_setup = 1,
- .async_size = sizeof(struct io_async_connect),
- },
- [IORING_OP_FALLOCATE] = {
- .needs_file = 1,
- },
- [IORING_OP_OPENAT] = {},
- [IORING_OP_CLOSE] = {},
- [IORING_OP_FILES_UPDATE] = {
- .audit_skip = 1,
- .iopoll = 1,
- },
- [IORING_OP_STATX] = {
- .audit_skip = 1,
- },
- [IORING_OP_READ] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .plug = 1,
- .audit_skip = 1,
- .ioprio = 1,
- .iopoll = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_WRITE] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .plug = 1,
- .audit_skip = 1,
- .ioprio = 1,
- .iopoll = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_FADVISE] = {
- .needs_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_MADVISE] = {},
- [IORING_OP_SEND] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .audit_skip = 1,
- },
- [IORING_OP_RECV] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .audit_skip = 1,
- },
- [IORING_OP_OPENAT2] = {
- },
- [IORING_OP_EPOLL_CTL] = {
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_SPLICE] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_PROVIDE_BUFFERS] = {
- .audit_skip = 1,
- .iopoll = 1,
- },
- [IORING_OP_REMOVE_BUFFERS] = {
- .audit_skip = 1,
- .iopoll = 1,
- },
- [IORING_OP_TEE] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_SHUTDOWN] = {
- .needs_file = 1,
- },
- [IORING_OP_RENAMEAT] = {},
- [IORING_OP_UNLINKAT] = {},
- [IORING_OP_MKDIRAT] = {},
- [IORING_OP_SYMLINKAT] = {},
- [IORING_OP_LINKAT] = {},
- [IORING_OP_MSG_RING] = {
- .needs_file = 1,
- .iopoll = 1,
- },
- [IORING_OP_FSETXATTR] = {
- .needs_file = 1
- },
- [IORING_OP_SETXATTR] = {},
- [IORING_OP_FGETXATTR] = {
- .needs_file = 1
- },
- [IORING_OP_GETXATTR] = {},
- [IORING_OP_SOCKET] = {
- .audit_skip = 1,
- },
- [IORING_OP_URING_CMD] = {
- .needs_file = 1,
- .plug = 1,
- .needs_async_setup = 1,
- .async_size = uring_cmd_pdu_size(1),
- },
-};
-
-/* requests with any of those set should undergo io_disarm_next() */
-#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
-#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
-
-static bool io_disarm_next(struct io_kiocb *req);
-static void io_uring_del_tctx_node(unsigned long index);
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all);
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-
-static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
-static void io_dismantle_req(struct io_kiocb *req);
-static void io_queue_linked_timeout(struct io_kiocb *req);
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
- struct io_uring_rsrc_update2 *up,
- unsigned nr_args);
-static void io_clean_op(struct io_kiocb *req);
-static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
- unsigned issue_flags);
-static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
-static void io_queue_sqe(struct io_kiocb *req);
-static void io_rsrc_put_work(struct work_struct *work);
-
-static void io_req_task_queue(struct io_kiocb *req);
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
-static int io_req_prep_async(struct io_kiocb *req);
-
-static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
- unsigned int issue_flags, u32 slot_index);
-static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
- unsigned int offset);
-static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
-
-static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
-static void io_eventfd_signal(struct io_ring_ctx *ctx);
-static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
-
-static struct kmem_cache *req_cachep;
-
-static const struct file_operations io_uring_fops;
-
-const char *io_uring_get_opcode(u8 opcode)
-{
- switch ((enum io_uring_op)opcode) {
- case IORING_OP_NOP:
- return "NOP";
- case IORING_OP_READV:
- return "READV";
- case IORING_OP_WRITEV:
- return "WRITEV";
- case IORING_OP_FSYNC:
- return "FSYNC";
- case IORING_OP_READ_FIXED:
- return "READ_FIXED";
- case IORING_OP_WRITE_FIXED:
- return "WRITE_FIXED";
- case IORING_OP_POLL_ADD:
- return "POLL_ADD";
- case IORING_OP_POLL_REMOVE:
- return "POLL_REMOVE";
- case IORING_OP_SYNC_FILE_RANGE:
- return "SYNC_FILE_RANGE";
- case IORING_OP_SENDMSG:
- return "SENDMSG";
- case IORING_OP_RECVMSG:
- return "RECVMSG";
- case IORING_OP_TIMEOUT:
- return "TIMEOUT";
- case IORING_OP_TIMEOUT_REMOVE:
- return "TIMEOUT_REMOVE";
- case IORING_OP_ACCEPT:
- return "ACCEPT";
- case IORING_OP_ASYNC_CANCEL:
- return "ASYNC_CANCEL";
- case IORING_OP_LINK_TIMEOUT:
- return "LINK_TIMEOUT";
- case IORING_OP_CONNECT:
- return "CONNECT";
- case IORING_OP_FALLOCATE:
- return "FALLOCATE";
- case IORING_OP_OPENAT:
- return "OPENAT";
- case IORING_OP_CLOSE:
- return "CLOSE";
- case IORING_OP_FILES_UPDATE:
- return "FILES_UPDATE";
- case IORING_OP_STATX:
- return "STATX";
- case IORING_OP_READ:
- return "READ";
- case IORING_OP_WRITE:
- return "WRITE";
- case IORING_OP_FADVISE:
- return "FADVISE";
- case IORING_OP_MADVISE:
- return "MADVISE";
- case IORING_OP_SEND:
- return "SEND";
- case IORING_OP_RECV:
- return "RECV";
- case IORING_OP_OPENAT2:
- return "OPENAT2";
- case IORING_OP_EPOLL_CTL:
- return "EPOLL_CTL";
- case IORING_OP_SPLICE:
- return "SPLICE";
- case IORING_OP_PROVIDE_BUFFERS:
- return "PROVIDE_BUFFERS";
- case IORING_OP_REMOVE_BUFFERS:
- return "REMOVE_BUFFERS";
- case IORING_OP_TEE:
- return "TEE";
- case IORING_OP_SHUTDOWN:
- return "SHUTDOWN";
- case IORING_OP_RENAMEAT:
- return "RENAMEAT";
- case IORING_OP_UNLINKAT:
- return "UNLINKAT";
- case IORING_OP_MKDIRAT:
- return "MKDIRAT";
- case IORING_OP_SYMLINKAT:
- return "SYMLINKAT";
- case IORING_OP_LINKAT:
- return "LINKAT";
- case IORING_OP_MSG_RING:
- return "MSG_RING";
- case IORING_OP_FSETXATTR:
- return "FSETXATTR";
- case IORING_OP_SETXATTR:
- return "SETXATTR";
- case IORING_OP_FGETXATTR:
- return "FGETXATTR";
- case IORING_OP_GETXATTR:
- return "GETXATTR";
- case IORING_OP_SOCKET:
- return "SOCKET";
- case IORING_OP_URING_CMD:
- return "URING_CMD";
- case IORING_OP_LAST:
- return "INVALID";
- }
- return "INVALID";
-}
-
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (file->f_op == &io_uring_fops) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
-#if defined(CONFIG_UNIX)
-static inline bool io_file_need_scm(struct file *filp)
-{
-#if defined(IO_URING_SCM_ALL)
- return true;
-#else
- return !!unix_get_socket(filp);
-#endif
-}
-#else
-static inline bool io_file_need_scm(struct file *filp)
-{
- return false;
-}
-#endif
-
-static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
-{
- lockdep_assert_held(&ctx->uring_lock);
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
-}
-
-static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
-{
- /*
- * "Normal" inline submissions always hold the uring_lock, since we
- * grab it from the system call. Same is true for the SQPOLL offload.
- * The only exception is when we've detached the request and issue it
- * from an async worker thread, grab the lock for that case.
- */
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
- lockdep_assert_held(&ctx->uring_lock);
-}
-
-static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
-{
- if (!*locked) {
- mutex_lock(&ctx->uring_lock);
- *locked = true;
- }
-}
-
-#define io_for_each_link(pos, head) \
- for (pos = (head); pos; pos = pos->link)
-
-/*
- * Shamelessly stolen from the mm implementation of page reference checking,
- * see commit f958d7b528b1 for details.
- */
-#define req_ref_zero_or_close_to_overflow(req) \
- ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
-
-static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
- return atomic_inc_not_zero(&req->refs);
-}
-
-static inline bool req_ref_put_and_test(struct io_kiocb *req)
-{
- if (likely(!(req->flags & REQ_F_REFCOUNT)))
- return true;
-
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- return atomic_dec_and_test(&req->refs);
-}
-
-static inline void req_ref_get(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- atomic_inc(&req->refs);
-}
-
-static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
-{
- if (!wq_list_empty(&ctx->submit_state.compl_reqs))
- __io_submit_flush_completions(ctx);
-}
-
-static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
-{
- if (!(req->flags & REQ_F_REFCOUNT)) {
- req->flags |= REQ_F_REFCOUNT;
- atomic_set(&req->refs, nr);
- }
-}
-
-static inline void io_req_set_refcount(struct io_kiocb *req)
-{
- __io_req_set_refcount(req, 1);
-}
-
-#define IO_RSRC_REF_BATCH 100
-
-static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
-{
- percpu_ref_put_many(&node->refs, nr);
-}
-
-static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
- struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct io_rsrc_node *node = req->rsrc_node;
-
- if (node) {
- if (node == ctx->rsrc_node)
- ctx->rsrc_cached_refs++;
- else
- io_rsrc_put_node(node, 1);
- }
-}
-
-static inline void io_req_put_rsrc(struct io_kiocb *req)
-{
- if (req->rsrc_node)
- io_rsrc_put_node(req->rsrc_node, 1);
-}
-
-static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- if (ctx->rsrc_cached_refs) {
- io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
- ctx->rsrc_cached_refs = 0;
- }
-}
-
-static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
- percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
-}
-
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
- struct io_ring_ctx *ctx,
- unsigned int issue_flags)
-{
- if (!req->rsrc_node) {
- req->rsrc_node = ctx->rsrc_node;
-
- if (!(issue_flags & IO_URING_F_UNLOCKED)) {
- lockdep_assert_held(&ctx->uring_lock);
- ctx->rsrc_cached_refs--;
- if (unlikely(ctx->rsrc_cached_refs < 0))
- io_rsrc_refs_refill(ctx);
- } else {
- percpu_ref_get(&req->rsrc_node->refs);
- }
- }
-}
-
-static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
-{
- if (req->flags & REQ_F_BUFFER_RING) {
- if (req->buf_list)
- req->buf_list->head++;
- req->flags &= ~REQ_F_BUFFER_RING;
- } else {
- list_add(&req->kbuf->list, list);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- }
-
- return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
-}
-
-static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
-{
- lockdep_assert_held(&req->ctx->completion_lock);
-
- if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
- return 0;
- return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
-}
-
-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
- unsigned issue_flags)
-{
- unsigned int cflags;
-
- if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
- return 0;
-
- /*
- * We can add this buffer back to two lists:
- *
- * 1) The io_buffers_cache list. This one is protected by the
- * ctx->uring_lock. If we already hold this lock, add back to this
- * list as we can grab it from issue as well.
- * 2) The io_buffers_comp list. This one is protected by the
- * ctx->completion_lock.
- *
- * We migrate buffers from the comp_list to the issue cache list
- * when we need one.
- */
- if (req->flags & REQ_F_BUFFER_RING) {
- /* no buffers to recycle for this case */
- cflags = __io_put_kbuf(req, NULL);
- } else if (issue_flags & IO_URING_F_UNLOCKED) {
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
- spin_unlock(&ctx->completion_lock);
- } else {
- lockdep_assert_held(&req->ctx->uring_lock);
-
- cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
- }
-
- return cflags;
-}
-
-static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
- unsigned int bgid)
-{
- if (ctx->io_bl && bgid < BGID_ARRAY)
- return &ctx->io_bl[bgid];
-
- return xa_load(&ctx->io_bl_xa, bgid);
-}
-
-static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- struct io_buffer *buf;
-
- if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
- return;
- /*
- * For legacy provided buffer mode, don't recycle if we already did
- * IO to this buffer. For ring-mapped provided buffer mode, we should
- * increment ring->head to explicitly monopolize the buffer to avoid
- * multiple use.
- */
- if ((req->flags & REQ_F_BUFFER_SELECTED) &&
- (req->flags & REQ_F_PARTIAL_IO))
- return;
-
- /*
- * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
- * the flag and hence ensure that bl->head doesn't get incremented.
- * If the tail has already been incremented, hang on to it.
- */
- if (req->flags & REQ_F_BUFFER_RING) {
- if (req->buf_list) {
- if (req->flags & REQ_F_PARTIAL_IO) {
- req->buf_list->head++;
- req->buf_list = NULL;
- } else {
- req->buf_index = req->buf_list->bgid;
- req->flags &= ~REQ_F_BUFFER_RING;
- }
- }
- return;
- }
-
- io_ring_submit_lock(ctx, issue_flags);
-
- buf = req->kbuf;
- bl = io_buffer_get_list(ctx, buf->bgid);
- list_add(&buf->list, &bl->buf_list);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->buf_index = buf->bgid;
-
- io_ring_submit_unlock(ctx, issue_flags);
-}
-
-static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
- bool cancel_all)
- __must_hold(&req->ctx->timeout_lock)
-{
- struct io_kiocb *req;
-
- if (task && head->task != task)
- return false;
- if (cancel_all)
- return true;
-
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
-}
-
-static bool io_match_linked(struct io_kiocb *head)
-{
- struct io_kiocb *req;
-
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
-}
-
-/*
- * As io_match_task() but protected against racing with linked timeouts.
- * User must not hold timeout_lock.
- */
-static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
- bool cancel_all)
-{
- bool matched;
-
- if (task && head->task != task)
- return false;
- if (cancel_all)
- return true;
-
- if (head->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = head->ctx;
-
- /* protect against races with linked timeouts */
- spin_lock_irq(&ctx->timeout_lock);
- matched = io_match_linked(head);
- spin_unlock_irq(&ctx->timeout_lock);
- } else {
- matched = io_match_linked(head);
- }
- return matched;
-}
-
-static inline bool req_has_async_data(struct io_kiocb *req)
-{
- return req->flags & REQ_F_ASYNC_DATA;
-}
-
-static inline void req_set_fail(struct io_kiocb *req)
-{
- req->flags |= REQ_F_FAIL;
- if (req->flags & REQ_F_CQE_SKIP) {
- req->flags &= ~REQ_F_CQE_SKIP;
- req->flags |= REQ_F_SKIP_LINK_CQES;
- }
-}
-
-static inline void req_fail_link_node(struct io_kiocb *req, int res)
-{
- req_set_fail(req);
- req->cqe.res = res;
-}
-
-static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
-}
-
-static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
-{
- struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
-
- complete(&ctx->ref_comp);
-}
-
-static inline bool io_is_timeout_noseq(struct io_kiocb *req)
-{
- return !req->timeout.off;
-}
-
-static __cold void io_fallback_req_func(struct work_struct *work)
-{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
- fallback_work.work);
- struct llist_node *node = llist_del_all(&ctx->fallback_llist);
- struct io_kiocb *req, *tmp;
- bool locked = false;
-
- percpu_ref_get(&ctx->refs);
- llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
- req->io_task_work.func(req, &locked);
-
- if (locked) {
- io_submit_flush_completions(ctx);
- mutex_unlock(&ctx->uring_lock);
- }
- percpu_ref_put(&ctx->refs);
-}
-
-static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
-{
- struct io_ring_ctx *ctx;
- int hash_bits;
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return NULL;
-
- xa_init(&ctx->io_bl_xa);
-
- /*
- * Use 5 bits less than the max cq entries, that should give us around
- * 32 entries per hash list if totally full and uniformly spread.
- */
- hash_bits = ilog2(p->cq_entries);
- hash_bits -= 5;
- if (hash_bits <= 0)
- hash_bits = 1;
- ctx->cancel_hash_bits = hash_bits;
- ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!ctx->cancel_hash)
- goto err;
- __hash_init(ctx->cancel_hash, 1U << hash_bits);
-
- ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
- if (!ctx->dummy_ubuf)
- goto err;
- /* set invalid range, so io_import_fixed() fails meeting it */
- ctx->dummy_ubuf->ubuf = -1UL;
-
- if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
- goto err;
-
- ctx->flags = p->flags;
- init_waitqueue_head(&ctx->sqo_sq_wait);
- INIT_LIST_HEAD(&ctx->sqd_list);
- INIT_LIST_HEAD(&ctx->cq_overflow_list);
- INIT_LIST_HEAD(&ctx->io_buffers_cache);
- INIT_LIST_HEAD(&ctx->apoll_cache);
- init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
- mutex_init(&ctx->uring_lock);
- init_waitqueue_head(&ctx->cq_wait);
- spin_lock_init(&ctx->completion_lock);
- spin_lock_init(&ctx->timeout_lock);
- INIT_WQ_LIST(&ctx->iopoll_list);
- INIT_LIST_HEAD(&ctx->io_buffers_pages);
- INIT_LIST_HEAD(&ctx->io_buffers_comp);
- INIT_LIST_HEAD(&ctx->defer_list);
- INIT_LIST_HEAD(&ctx->timeout_list);
- INIT_LIST_HEAD(&ctx->ltimeout_list);
- spin_lock_init(&ctx->rsrc_ref_lock);
- INIT_LIST_HEAD(&ctx->rsrc_ref_list);
- INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
- init_llist_head(&ctx->rsrc_put_llist);
- INIT_LIST_HEAD(&ctx->tctx_list);
- ctx->submit_state.free_list.next = NULL;
- INIT_WQ_LIST(&ctx->locked_free_list);
- INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
- INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
- return ctx;
-err:
- kfree(ctx->dummy_ubuf);
- kfree(ctx->cancel_hash);
- kfree(ctx->io_bl);
- xa_destroy(&ctx->io_bl_xa);
- kfree(ctx);
- return NULL;
-}
-
-static void io_account_cq_overflow(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
-}
-
-static bool req_need_defer(struct io_kiocb *req, u32 seq)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
- }
-
- return false;
-}
-
-static inline bool io_req_ffs_set(struct io_kiocb *req)
-{
- return req->flags & REQ_F_FIXED_FILE;
-}
-
-static inline void io_req_track_inflight(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_INFLIGHT)) {
- req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&req->task->io_uring->inflight_tracked);
- }
-}
-
-static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
-{
- if (WARN_ON_ONCE(!req->link))
- return NULL;
-
- req->flags &= ~REQ_F_ARM_LTIMEOUT;
- req->flags |= REQ_F_LINK_TIMEOUT;
-
- /* linked timeouts should have two refs once prep'ed */
- io_req_set_refcount(req);
- __io_req_set_refcount(req->link, 2);
- return req->link;
-}
-
-static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
-{
- if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
- return NULL;
- return __io_prep_linked_timeout(req);
-}
-
-static noinline void __io_arm_ltimeout(struct io_kiocb *req)
-{
- io_queue_linked_timeout(__io_prep_linked_timeout(req));
-}
-
-static inline void io_arm_ltimeout(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
- __io_arm_ltimeout(req);
-}
-
-static void io_prep_async_work(struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!(req->flags & REQ_F_CREDS)) {
- req->flags |= REQ_F_CREDS;
- req->creds = get_current_cred();
- }
-
- req->work.list.next = NULL;
- req->work.flags = 0;
- req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
- if (req->flags & REQ_F_FORCE_ASYNC)
- req->work.flags |= IO_WQ_WORK_CONCURRENT;
-
- if (req->flags & REQ_F_ISREG) {
- if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
- io_wq_hash_work(&req->work, file_inode(req->file));
- } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
- if (def->unbound_nonreg_file)
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- }
-}
-
-static void io_prep_async_link(struct io_kiocb *req)
-{
- struct io_kiocb *cur;
-
- if (req->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->timeout_lock);
- io_for_each_link(cur, req)
- io_prep_async_work(cur);
- spin_unlock_irq(&ctx->timeout_lock);
- } else {
- io_for_each_link(cur, req)
- io_prep_async_work(cur);
- }
-}
-
-static inline void io_req_add_compl_list(struct io_kiocb *req)
-{
- struct io_submit_state *state = &req->ctx->submit_state;
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- state->flush_cqes = true;
- wq_list_add_tail(&req->comp_list, &state->compl_reqs);
-}
-
-static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
-{
- struct io_kiocb *link = io_prep_linked_timeout(req);
- struct io_uring_task *tctx = req->task->io_uring;
-
- BUG_ON(!tctx);
- BUG_ON(!tctx->io_wq);
-
- /* init ->work of the whole link before punting */
- io_prep_async_link(req);
-
- /*
- * Not expected to happen, but if we do have a bug where this _can_
- * happen, catch it here and ensure the request is marked as
- * canceled. That will make io-wq go through the usual work cancel
- * procedure rather than attempt to run this request (or create a new
- * worker for it).
- */
- if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
- req->work.flags |= IO_WQ_WORK_CANCEL;
-
- trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
- req->opcode, req->flags, &req->work,
- io_wq_is_hashed(&req->work));
- io_wq_enqueue(tctx->io_wq, &req->work);
- if (link)
- io_queue_linked_timeout(link);
-}
-
-static void io_kill_timeout(struct io_kiocb *req, int status)
- __must_hold(&req->ctx->completion_lock)
- __must_hold(&req->ctx->timeout_lock)
-{
- struct io_timeout_data *io = req->async_data;
-
- if (hrtimer_try_to_cancel(&io->timer) != -1) {
- if (status)
- req_set_fail(req);
- atomic_set(&req->ctx->cq_timeouts,
- atomic_read(&req->ctx->cq_timeouts) + 1);
- list_del_init(&req->timeout.list);
- io_req_tw_post_queue(req, status, 0);
- }
-}
-
-static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
-{
- while (!list_empty(&ctx->defer_list)) {
- struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
- struct io_defer_entry, list);
-
- if (req_need_defer(de->req, de->seq))
- break;
- list_del_init(&de->list);
- io_req_task_queue(de->req);
- kfree(de);
- }
-}
-
-static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
- __must_hold(&ctx->completion_lock)
-{
- u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- struct io_kiocb *req, *tmp;
-
- spin_lock_irq(&ctx->timeout_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- u32 events_needed, events_got;
-
- if (io_is_timeout_noseq(req))
- break;
-
- /*
- * Since seq can easily wrap around over time, subtract
- * the last seq at which timeouts were flushed before comparing.
- * Assuming not more than 2^31-1 events have happened since,
- * these subtractions won't have wrapped, so we can check if
- * target is in [last_seq, current_seq] by comparing the two.
- */
- events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
- events_got = seq - ctx->cq_last_tm_flush;
- if (events_got < events_needed)
- break;
-
- io_kill_timeout(req, 0);
- }
- ctx->cq_last_tm_flush = seq;
- spin_unlock_irq(&ctx->timeout_lock);
-}
-
-static inline void io_commit_cqring(struct io_ring_ctx *ctx)
-{
- /* order cqe stores with ring update */
- smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
-}
-
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used || ctx->drain_active) {
- spin_lock(&ctx->completion_lock);
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- }
- if (ctx->has_evfd)
- io_eventfd_signal(ctx);
-}
-
-static inline bool io_sqring_full(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
-}
-
-static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
-{
- return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
-}
-
-/*
- * writes to the cq entry need to come after reading head; the
- * control dependency is enough as we're using WRITE_ONCE to
- * fill the cq entry
- */
-static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
- unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
- unsigned int shift = 0;
- unsigned int free, queued, len;
-
- if (ctx->flags & IORING_SETUP_CQE32)
- shift = 1;
-
- /* userspace may cheat modifying the tail, be safe and do min */
- queued = min(__io_cqring_events(ctx), ctx->cq_entries);
- free = ctx->cq_entries - queued;
- /* we need a contiguous range, limit based on the current array offset */
- len = min(free, ctx->cq_entries - off);
- if (!len)
- return NULL;
-
- ctx->cached_cq_tail++;
- ctx->cqe_cached = &rings->cqes[off];
- ctx->cqe_sentinel = ctx->cqe_cached + len;
- ctx->cqe_cached++;
- return &rings->cqes[off << shift];
-}
-
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
-{
- if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
- struct io_uring_cqe *cqe = ctx->cqe_cached;
-
- if (ctx->flags & IORING_SETUP_CQE32) {
- unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
-
- cqe += off;
- }
-
- ctx->cached_cq_tail++;
- ctx->cqe_cached++;
- return cqe;
- }
-
- return __io_get_cqe(ctx);
-}
-
-static void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- rcu_read_lock();
- /*
- * rcu_dereference ctx->io_ev_fd once and use it for both for checking
- * and eventfd_signal
- */
- ev_fd = rcu_dereference(ctx->io_ev_fd);
-
- /*
- * Check again if ev_fd exists incase an io_eventfd_unregister call
- * completed between the NULL check of ctx->io_ev_fd at the start of
- * the function and rcu_read_lock.
- */
- if (unlikely(!ev_fd))
- goto out;
- if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- goto out;
-
- if (!ev_fd->eventfd_async || io_wq_current_is_worker())
- eventfd_signal(ev_fd->cq_ev_fd, 1);
-out:
- rcu_read_unlock();
-}
-
-static inline void io_cqring_wake(struct io_ring_ctx *ctx)
-{
- /*
- * wake_up_all() may seem excessive, but io_wake_function() and
- * io_should_wake() handle the termination of the loop and only
- * wake as many waiters as we need to.
- */
- if (wq_has_sleeper(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
-}
-
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
- ctx->has_evfd))
- __io_commit_cqring_flush(ctx);
-
- io_cqring_wake(ctx);
-}
-
-static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
-{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
- ctx->has_evfd))
- __io_commit_cqring_flush(ctx);
-
- if (ctx->flags & IORING_SETUP_SQPOLL)
- io_cqring_wake(ctx);
-}
-
-/* Returns true if there are no backlogged entries after the flush */
-static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
-{
- bool all_flushed, posted;
- size_t cqe_size = sizeof(struct io_uring_cqe);
-
- if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
- return false;
-
- if (ctx->flags & IORING_SETUP_CQE32)
- cqe_size <<= 1;
-
- posted = false;
- spin_lock(&ctx->completion_lock);
- while (!list_empty(&ctx->cq_overflow_list)) {
- struct io_uring_cqe *cqe = io_get_cqe(ctx);
- struct io_overflow_cqe *ocqe;
-
- if (!cqe && !force)
- break;
- ocqe = list_first_entry(&ctx->cq_overflow_list,
- struct io_overflow_cqe, list);
- if (cqe)
- memcpy(cqe, &ocqe->cqe, cqe_size);
- else
- io_account_cq_overflow(ctx);
-
- posted = true;
- list_del(&ocqe->list);
- kfree(ocqe);
- }
-
- all_flushed = list_empty(&ctx->cq_overflow_list);
- if (all_flushed) {
- clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
- }
-
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- return all_flushed;
-}
-
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
-{
- bool ret = true;
-
- if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
- /* iopoll syncs against uring_lock, not completion_lock */
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_lock(&ctx->uring_lock);
- ret = __io_cqring_overflow_flush(ctx, false);
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_unlock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-static void __io_put_task(struct task_struct *task, int nr)
-{
- struct io_uring_task *tctx = task->io_uring;
-
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
-}
-
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
-{
- if (likely(task == current))
- task->io_uring->cached_refs += nr;
- else
- __io_put_task(task, nr);
-}
-
-static void io_task_refs_refill(struct io_uring_task *tctx)
-{
- unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
-
- percpu_counter_add(&tctx->inflight, refill);
- refcount_add(refill, &current->usage);
- tctx->cached_refs += refill;
-}
-
-static inline void io_get_task_refs(int nr)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- tctx->cached_refs -= nr;
- if (unlikely(tctx->cached_refs < 0))
- io_task_refs_refill(tctx);
-}
-
-static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
-{
- struct io_uring_task *tctx = task->io_uring;
- unsigned int refs = tctx->cached_refs;
-
- if (refs) {
- tctx->cached_refs = 0;
- percpu_counter_sub(&tctx->inflight, refs);
- put_task_struct_many(task, refs);
- }
-}
-
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags, u64 extra1,
- u64 extra2)
-{
- struct io_overflow_cqe *ocqe;
- size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
-
- if (is_cqe32)
- ocq_size += sizeof(struct io_uring_cqe);
-
- ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
- trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
- if (!ocqe) {
- /*
- * If we're in ring overflow flush mode, or in task cancel mode,
- * or cannot allocate an overflow entry, then we need to drop it
- * on the floor.
- */
- io_account_cq_overflow(ctx);
- set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
- return false;
- }
- if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
-
- }
- ocqe->cqe.user_data = user_data;
- ocqe->cqe.res = res;
- ocqe->cqe.flags = cflags;
- if (is_cqe32) {
- ocqe->cqe.big_cqe[0] = extra1;
- ocqe->cqe.big_cqe[1] = extra2;
- }
- list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
- return true;
-}
-
-static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_uring_cqe *cqe;
-
- if (!(ctx->flags & IORING_SETUP_CQE32)) {
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, 0, 0);
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- memcpy(cqe, &req->cqe, sizeof(*cqe));
- return true;
- }
-
- return io_cqring_event_overflow(ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- 0, 0);
- } else {
- u64 extra1 = 0, extra2 = 0;
-
- if (req->flags & REQ_F_CQE32_INIT) {
- extra1 = req->extra1;
- extra2 = req->extra2;
- }
-
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, extra1, extra2);
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
- WRITE_ONCE(cqe->big_cqe[0], extra1);
- WRITE_ONCE(cqe->big_cqe[1], extra2);
- return true;
- }
-
- return io_cqring_event_overflow(ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- extra1, extra2);
- }
-}
-
-static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
-{
- struct io_uring_cqe *cqe;
-
- ctx->cq_extra++;
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, user_data);
- WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, cflags);
-
- if (ctx->flags & IORING_SETUP_CQE32) {
- WRITE_ONCE(cqe->big_cqe[0], 0);
- WRITE_ONCE(cqe->big_cqe[1], 0);
- }
- return true;
- }
- return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
-}
-
-static void __io_req_complete_put(struct io_kiocb *req)
-{
- /*
- * If we're the last reference to this request, add to our locked
- * free_list cache.
- */
- if (req_ref_put_and_test(req)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- if (req->flags & IO_REQ_LINK_FLAGS) {
- if (req->flags & IO_DISARM_MASK)
- io_disarm_next(req);
- if (req->link) {
- io_req_task_queue(req->link);
- req->link = NULL;
- }
- }
- io_req_put_rsrc(req);
- /*
- * Selected buffer deallocation in io_clean_op() assumes that
- * we don't hold ->completion_lock. Clean them here to avoid
- * deadlocks.
- */
- io_put_kbuf_comp(req);
- io_dismantle_req(req);
- io_put_task(req->task, 1);
- wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
- ctx->locked_free_nr++;
- }
-}
-
-static void __io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
-{
- if (!(req->flags & REQ_F_CQE_SKIP)) {
- req->cqe.res = res;
- req->cqe.flags = cflags;
- __io_fill_cqe_req(req->ctx, req);
- }
- __io_req_complete_put(req);
-}
-
-static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- __io_req_complete_post(req, res, cflags);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
-static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
- u32 cflags)
-{
- req->cqe.res = res;
- req->cqe.flags = cflags;
- req->flags |= REQ_F_COMPLETE_INLINE;
-}
-
-static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- s32 res, u32 cflags)
-{
- if (issue_flags & IO_URING_F_COMPLETE_DEFER)
- io_req_complete_state(req, res, cflags);
- else
- io_req_complete_post(req, res, cflags);
-}
-
-static inline void io_req_complete(struct io_kiocb *req, s32 res)
-{
- if (res < 0)
- req_set_fail(req);
- __io_req_complete(req, 0, res, 0);
-}
-
-static void io_req_complete_failed(struct io_kiocb *req, s32 res)
-{
- req_set_fail(req);
- io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
-}
-
-/*
- * Don't initialise the fields below on every allocation, but do that in
- * advance and keep them valid across allocations.
- */
-static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- req->ctx = ctx;
- req->link = NULL;
- req->async_data = NULL;
- /* not necessary, but safer to zero */
- req->cqe.res = 0;
-}
-
-static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
-{
- spin_lock(&ctx->completion_lock);
- wq_list_splice(&ctx->locked_free_list, &state->free_list);
- ctx->locked_free_nr = 0;
- spin_unlock(&ctx->completion_lock);
-}
-
-static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
-{
- return !ctx->submit_state.free_list.next;
-}
-
-/*
- * A request might get retired back into the request caches even before opcode
- * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
- * Because of that, io_alloc_req() should be called only under ->uring_lock
- * and with extra caution to not get a request that is still worked on.
- */
-static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- void *reqs[IO_REQ_ALLOC_BATCH];
- int ret, i;
-
- /*
- * If we have more than a batch's worth of requests in our IRQ side
- * locked cache, grab the lock and move them over to our submission
- * side cache.
- */
- if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
- io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
- if (!io_req_cache_empty(ctx))
- return true;
- }
-
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
-
- /*
- * Bulk alloc is all-or-nothing. If we fail to get a batch,
- * retry single alloc to be on the safe side.
- */
- if (unlikely(ret <= 0)) {
- reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!reqs[0])
- return false;
- ret = 1;
- }
-
- percpu_ref_get_many(&ctx->refs, ret);
- for (i = 0; i < ret; i++) {
- struct io_kiocb *req = reqs[i];
-
- io_preinit_req(req, ctx);
- io_req_add_to_cache(req, ctx);
- }
- return true;
-}
-
-static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
-{
- if (unlikely(io_req_cache_empty(ctx)))
- return __io_alloc_req_refill(ctx);
- return true;
-}
-
-static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
-{
- struct io_wq_work_node *node;
-
- node = wq_stack_extract(&ctx->submit_state.free_list);
- return container_of(node, struct io_kiocb, comp_list);
-}
-
-static inline void io_put_file(struct file *file)
-{
- if (file)
- fput(file);
-}
-
-static inline void io_dismantle_req(struct io_kiocb *req)
-{
- unsigned int flags = req->flags;
-
- if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
- io_clean_op(req);
- if (!(flags & REQ_F_FIXED_FILE))
- io_put_file(req->file);
-}
-
-static __cold void io_free_req(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- io_req_put_rsrc(req);
- io_dismantle_req(req);
- io_put_task(req->task, 1);
-
- spin_lock(&ctx->completion_lock);
- wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
- ctx->locked_free_nr++;
- spin_unlock(&ctx->completion_lock);
-}
-
-static inline void io_remove_next_linked(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = req->link;
-
- req->link = nxt->link;
- nxt->link = NULL;
-}
-
-static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
- __must_hold(&req->ctx->timeout_lock)
-{
- struct io_kiocb *link = req->link;
-
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
- struct io_timeout_data *io = link->async_data;
-
- io_remove_next_linked(req);
- link->timeout.head = NULL;
- if (hrtimer_try_to_cancel(&io->timer) != -1) {
- list_del(&link->timeout.list);
- return link;
- }
- }
- return NULL;
-}
-
-static void io_fail_links(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- struct io_kiocb *nxt, *link = req->link;
- bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
-
- req->link = NULL;
- while (link) {
- long res = -ECANCELED;
-
- if (link->flags & REQ_F_FAIL)
- res = link->cqe.res;
-
- nxt = link->link;
- link->link = NULL;
-
- trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
- req->opcode, link);
-
- if (ignore_cqes)
- link->flags |= REQ_F_CQE_SKIP;
- else
- link->flags &= ~REQ_F_CQE_SKIP;
- __io_req_complete_post(link, res, 0);
- link = nxt;
- }
-}
-
-static bool io_disarm_next(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- struct io_kiocb *link = NULL;
- bool posted = false;
-
- if (req->flags & REQ_F_ARM_LTIMEOUT) {
- link = req->link;
- req->flags &= ~REQ_F_ARM_LTIMEOUT;
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
- io_remove_next_linked(req);
- io_req_tw_post_queue(link, -ECANCELED, 0);
- posted = true;
- }
- } else if (req->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->timeout_lock);
- link = io_disarm_linked_timeout(req);
- spin_unlock_irq(&ctx->timeout_lock);
- if (link) {
- posted = true;
- io_req_tw_post_queue(link, -ECANCELED, 0);
- }
- }
- if (unlikely((req->flags & REQ_F_FAIL) &&
- !(req->flags & REQ_F_HARDLINK))) {
- posted |= (req->link != NULL);
- io_fail_links(req);
- }
- return posted;
-}
-
-static void __io_req_find_next_prep(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
-}
-
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
-
- /*
- * If LINK is set, we have dependent requests in this chain. If we
- * didn't fail this request, queue the first one up, moving any other
- * dependencies to the next request. In case of failure, fail the rest
- * of the chain.
- */
- if (unlikely(req->flags & IO_DISARM_MASK))
- __io_req_find_next_prep(req);
- nxt = req->link;
- req->link = NULL;
- return nxt;
-}
-
-static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
-{
- if (!ctx)
- return;
- if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
- atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
- if (*locked) {
- io_submit_flush_completions(ctx);
- mutex_unlock(&ctx->uring_lock);
- *locked = false;
- }
- percpu_ref_put(&ctx->refs);
-}
-
-static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
-{
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
-static void handle_prev_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *uring_locked)
-{
- if (*ctx && !*uring_locked)
- spin_lock(&(*ctx)->completion_lock);
-
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- if (unlikely(!*uring_locked && *ctx))
- ctx_commit_and_unlock(*ctx);
-
- ctx_flush_and_put(*ctx, uring_locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- if (unlikely(!*uring_locked))
- spin_lock(&(*ctx)->completion_lock);
- }
- if (likely(*uring_locked))
- req->io_task_work.func(req, uring_locked);
- else
- __io_req_complete_post(req, req->cqe.res,
- io_put_kbuf_comp(req));
- node = next;
- } while (node);
-
- if (unlikely(!*uring_locked))
- ctx_commit_and_unlock(*ctx);
-}
-
-static void handle_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *locked)
-{
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- ctx_flush_and_put(*ctx, locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- }
- req->io_task_work.func(req, locked);
- node = next;
- } while (node);
-}
-
-static void tctx_task_work(struct callback_head *cb)
-{
- bool uring_locked = false;
- struct io_ring_ctx *ctx = NULL;
- struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
- task_work);
-
- while (1) {
- struct io_wq_work_node *node1, *node2;
-
- spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prio_task_list.first;
- node2 = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
- if (!node2 && !node1)
- tctx->task_running = false;
- spin_unlock_irq(&tctx->task_lock);
- if (!node2 && !node1)
- break;
-
- if (node1)
- handle_prev_tw_list(node1, &ctx, &uring_locked);
- if (node2)
- handle_tw_list(node2, &ctx, &uring_locked);
- cond_resched();
-
- if (data_race(!tctx->task_list.first) &&
- data_race(!tctx->prio_task_list.first) && uring_locked)
- io_submit_flush_completions(ctx);
- }
-
- ctx_flush_and_put(ctx, &uring_locked);
-
- /* relaxed read is enough as only the task itself sets ->in_idle */
- if (unlikely(atomic_read(&tctx->in_idle)))
- io_uring_drop_tctx_refs(current);
-}
-
-static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx,
- struct io_wq_work_list *list)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_wq_work_node *node;
- unsigned long flags;
- bool running;
-
- spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, list);
- running = tctx->task_running;
- if (!running)
- tctx->task_running = true;
- spin_unlock_irqrestore(&tctx->task_lock, flags);
-
- /* task_work already pending, we're done */
- if (running)
- return;
-
- if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
- atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-
- if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
- return;
-
- spin_lock_irqsave(&tctx->task_lock, flags);
- tctx->task_running = false;
- node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
- spin_unlock_irqrestore(&tctx->task_lock, flags);
-
- while (node) {
- req = container_of(node, struct io_kiocb, io_task_work.node);
- node = node->next;
- if (llist_add(&req->io_task_work.fallback_node,
- &req->ctx->fallback_llist))
- schedule_delayed_work(&req->ctx->fallback_work, 1);
- }
-}
-
-static void io_req_task_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- __io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-static void io_req_task_prio_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- if (req->ctx->flags & IORING_SETUP_SQPOLL)
- __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
- else
- __io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-static void io_req_tw_post(struct io_kiocb *req, bool *locked)
-{
- io_req_complete_post(req, req->cqe.res, req->cqe.flags);
-}
-
-static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
-{
- req->cqe.res = res;
- req->cqe.flags = cflags;
- req->io_task_work.func = io_req_tw_post;
- io_req_task_work_add(req);
-}
-
-static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
-{
- /* not needed for normal modes, but SQPOLL depends on it */
- io_tw_lock(req->ctx, locked);
- io_req_complete_failed(req, req->cqe.res);
-}
-
-static void io_req_task_submit(struct io_kiocb *req, bool *locked)
-{
- io_tw_lock(req->ctx, locked);
- /* req->task == current here, checking PF_EXITING is safe */
- if (likely(!(req->task->flags & PF_EXITING)))
- io_queue_sqe(req);
- else
- io_req_complete_failed(req, -EFAULT);
-}
-
-static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
-{
- req->cqe.res = ret;
- req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req);
-}
-
-static void io_req_task_queue(struct io_kiocb *req)
-{
- req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req);
-}
-
-static void io_req_task_queue_reissue(struct io_kiocb *req)
-{
- req->io_task_work.func = io_queue_iowq;
- io_req_task_work_add(req);
-}
-
-static void io_queue_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = io_req_find_next(req);
-
- if (nxt)
- io_req_task_queue(nxt);
-}
-
-static void io_free_batch_list(struct io_ring_ctx *ctx,
- struct io_wq_work_node *node)
- __must_hold(&ctx->uring_lock)
-{
- struct task_struct *task = NULL;
- int task_refs = 0;
-
- do {
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- comp_list);
-
- if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
- if (req->flags & REQ_F_REFCOUNT) {
- node = req->comp_list.next;
- if (!req_ref_put_and_test(req))
- continue;
- }
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- struct async_poll *apoll = req->apoll;
-
- if (apoll->double_poll)
- kfree(apoll->double_poll);
- list_add(&apoll->poll.wait.entry,
- &ctx->apoll_cache);
- req->flags &= ~REQ_F_POLLED;
- }
- if (req->flags & IO_REQ_LINK_FLAGS)
- io_queue_next(req);
- if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
- io_clean_op(req);
- }
- if (!(req->flags & REQ_F_FIXED_FILE))
- io_put_file(req->file);
-
- io_req_put_rsrc_locked(req, ctx);
-
- if (req->task != task) {
- if (task)
- io_put_task(task, task_refs);
- task = req->task;
- task_refs = 0;
- }
- task_refs++;
- node = req->comp_list.next;
- io_req_add_to_cache(req, ctx);
- } while (node);
-
- if (task)
- io_put_task(task, task_refs);
-}
-
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct io_wq_work_node *node, *prev;
- struct io_submit_state *state = &ctx->submit_state;
-
- if (state->flush_cqes) {
- spin_lock(&ctx->completion_lock);
- wq_list_for_each(node, prev, &state->compl_reqs) {
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- comp_list);
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(ctx, req);
- }
-
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
- state->flush_cqes = false;
- }
-
- io_free_batch_list(ctx, state->compl_reqs.first);
- INIT_WQ_LIST(&state->compl_reqs);
-}
-
-/*
- * Drop reference to request, return next in chain (if there is one) if this
- * was the last reference to this request.
- */
-static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = NULL;
-
- if (req_ref_put_and_test(req)) {
- if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
- nxt = io_req_find_next(req);
- io_free_req(req);
- }
- return nxt;
-}
-
-static inline void io_put_req(struct io_kiocb *req)
-{
- if (req_ref_put_and_test(req)) {
- io_queue_next(req);
- io_free_req(req);
- }
-}
-
-static unsigned io_cqring_events(struct io_ring_ctx *ctx)
-{
- /* See comment at the top of this file */
- smp_rmb();
- return __io_cqring_events(ctx);
-}
-
-static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
-
- /* make sure SQ entry isn't read before tail */
- return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
-}
-
-static inline bool io_run_task_work(void)
-{
- if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
- __set_current_state(TASK_RUNNING);
- clear_notify_signal();
- if (task_work_pending(current))
- task_work_run();
- return true;
- }
-
- return false;
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
-{
- struct io_wq_work_node *pos, *start, *prev;
- unsigned int poll_flags = BLK_POLL_NOSLEEP;
- DEFINE_IO_COMP_BATCH(iob);
- int nr_events = 0;
-
- /*
- * Only spin for completions if we don't have multiple devices hanging
- * off our complete list.
- */
- if (ctx->poll_multi_queue || force_nonspin)
- poll_flags |= BLK_POLL_ONESHOT;
-
- wq_list_for_each(pos, start, &ctx->iopoll_list) {
- struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
- struct kiocb *kiocb = &req->rw.kiocb;
- int ret;
-
- /*
- * Move completed and retryable entries to our local lists.
- * If we find a request that requires polling, break out
- * and complete those lists first, if we have entries there.
- */
- if (READ_ONCE(req->iopoll_completed))
- break;
-
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
- if (unlikely(ret < 0))
- return ret;
- else if (ret)
- poll_flags |= BLK_POLL_ONESHOT;
-
- /* iopoll may have completed current req */
- if (!rq_list_empty(iob.req_list) ||
- READ_ONCE(req->iopoll_completed))
- break;
- }
-
- if (!rq_list_empty(iob.req_list))
- iob.complete(&iob);
- else if (!pos)
- return 0;
-
- prev = start;
- wq_list_for_each_resume(pos, prev) {
- struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-
- /* order with io_complete_rw_iopoll(), e.g. ->result updates */
- if (!smp_load_acquire(&req->iopoll_completed))
- break;
- nr_events++;
- if (unlikely(req->flags & REQ_F_CQE_SKIP))
- continue;
-
- req->cqe.flags = io_put_kbuf(req, 0);
- __io_fill_cqe_req(req->ctx, req);
- }
-
- if (unlikely(!nr_events))
- return 0;
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- pos = start ? start->next : ctx->iopoll_list.first;
- wq_list_cut(&ctx->iopoll_list, prev, start);
- io_free_batch_list(ctx, pos);
- return nr_events;
-}
-
-/*
- * We can't just wait for polled events to come to us, we have to actively
- * find and complete them.
- */
-static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_IOPOLL))
- return;
-
- mutex_lock(&ctx->uring_lock);
- while (!wq_list_empty(&ctx->iopoll_list)) {
- /* let it sleep and repeat later if can't complete a request */
- if (io_do_iopoll(ctx, true) == 0)
- break;
- /*
- * Ensure we allow local-to-the-cpu processing to take place,
- * in this case we need to ensure that we reap all events.
- * Also let task_work, etc. to progress by releasing the mutex
- */
- if (need_resched()) {
- mutex_unlock(&ctx->uring_lock);
- cond_resched();
- mutex_lock(&ctx->uring_lock);
- }
- }
- mutex_unlock(&ctx->uring_lock);
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
-{
- unsigned int nr_events = 0;
- int ret = 0;
- unsigned long check_cq;
-
- /*
- * Don't enter poll loop if we already have events pending.
- * If we do, we can potentially be spinning for commands that
- * already triggered a CQE (eg in error).
- */
- check_cq = READ_ONCE(ctx->check_cq);
- if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
- __io_cqring_overflow_flush(ctx, false);
- if (io_cqring_events(ctx))
- return 0;
-
- /*
- * Similarly do not spin if we have not informed the user of any
- * dropped CQE.
- */
- if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
- return -EBADR;
-
- do {
- /*
- * If a submit got punted to a workqueue, we can have the
- * application entering polling for a command before it gets
- * issued. That app will hold the uring_lock for the duration
- * of the poll right here, so we need to take a breather every
- * now and then to ensure that the issue has a chance to add
- * the poll to the issued list. Otherwise we can spin here
- * forever, while the workqueue is stuck trying to acquire the
- * very same mutex.
- */
- if (wq_list_empty(&ctx->iopoll_list)) {
- u32 tail = ctx->cached_cq_tail;
-
- mutex_unlock(&ctx->uring_lock);
- io_run_task_work();
- mutex_lock(&ctx->uring_lock);
-
- /* some requests don't go through iopoll_list */
- if (tail != ctx->cached_cq_tail ||
- wq_list_empty(&ctx->iopoll_list))
- break;
- }
- ret = io_do_iopoll(ctx, !min);
- if (ret < 0)
- break;
- nr_events += ret;
- ret = 0;
- } while (nr_events < min && !need_resched());
-
- return ret;
-}
-
-static void kiocb_end_write(struct io_kiocb *req)
-{
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (req->flags & REQ_F_ISREG) {
- struct super_block *sb = file_inode(req->file)->i_sb;
-
- __sb_writers_acquired(sb, SB_FREEZE_WRITE);
- sb_end_write(sb);
- }
-}
-
-#ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req)
-{
- struct io_async_rw *rw = req->async_data;
-
- if (!req_has_async_data(req))
- return !io_req_prep_async(req);
- iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
- return true;
-}
-
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
- umode_t mode = file_inode(req->file)->i_mode;
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!S_ISBLK(mode) && !S_ISREG(mode))
- return false;
- if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
- !(ctx->flags & IORING_SETUP_IOPOLL)))
- return false;
- /*
- * If ref is dying, we might be running poll reap from the exit work.
- * Don't attempt to reissue from that path, just let it fail with
- * -EAGAIN.
- */
- if (percpu_ref_is_dying(&ctx->refs))
- return false;
- /*
- * Play it safe and assume not safe to re-import and reissue if we're
- * not in the original thread group (or in task context).
- */
- if (!same_thread_group(req->task, current) || !in_task())
- return false;
- return true;
-}
-#else
-static bool io_resubmit_prep(struct io_kiocb *req)
-{
- return false;
-}
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
- return false;
-}
-#endif
-
-static bool __io_complete_rw_common(struct io_kiocb *req, long res)
-{
- if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
- kiocb_end_write(req);
- fsnotify_modify(req->file);
- } else {
- fsnotify_access(req->file);
- }
- if (unlikely(res != req->cqe.res)) {
- if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
- io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
- return true;
- }
- req_set_fail(req);
- req->cqe.res = res;
- }
- return false;
-}
-
-static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
-{
- int res = req->cqe.res;
-
- if (*locked) {
- io_req_complete_state(req, res, io_put_kbuf(req, 0));
- io_req_add_compl_list(req);
- } else {
- io_req_complete_post(req, res,
- io_put_kbuf(req, IO_URING_F_UNLOCKED));
- }
-}
-
-static void __io_complete_rw(struct io_kiocb *req, long res,
- unsigned int issue_flags)
-{
- if (__io_complete_rw_common(req, res))
- return;
- __io_req_complete(req, issue_flags, req->cqe.res,
- io_put_kbuf(req, issue_flags));
-}
-
-static void io_complete_rw(struct kiocb *kiocb, long res)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-
- if (__io_complete_rw_common(req, res))
- return;
- req->cqe.res = res;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_prio_work_add(req);
-}
-
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-
- if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
- if (unlikely(res != req->cqe.res)) {
- if (res == -EAGAIN && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
- return;
- }
- req->cqe.res = res;
- }
-
- /* order with io_iopoll_complete() checking ->iopoll_completed */
- smp_store_release(&req->iopoll_completed, 1);
-}
-
-/*
- * After the iocb has been issued, it's safe to be found on the poll list.
- * Adding the kiocb to the list AFTER submission ensures that we don't
- * find it from a io_do_iopoll() thread before the issuer is done
- * accessing the kiocb cookie.
- */
-static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
-
- /* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(needs_lock))
- mutex_lock(&ctx->uring_lock);
-
- /*
- * Track whether we have multiple files in our lists. This will impact
- * how we do polling eventually, not spinning if we're on potentially
- * different devices.
- */
- if (wq_list_empty(&ctx->iopoll_list)) {
- ctx->poll_multi_queue = false;
- } else if (!ctx->poll_multi_queue) {
- struct io_kiocb *list_req;
-
- list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
- comp_list);
- if (list_req->file != req->file)
- ctx->poll_multi_queue = true;
- }
-
- /*
- * For fast devices, IO may have already completed. If it has, add
- * it to the front so we find it first.
- */
- if (READ_ONCE(req->iopoll_completed))
- wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
- else
- wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
-
- if (unlikely(needs_lock)) {
- /*
- * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
- * in sq thread task context or in io worker task context. If
- * current task context is sq thread, we don't need to check
- * whether should wake up sq thread.
- */
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
- wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
-
- mutex_unlock(&ctx->uring_lock);
- }
-}
-
-static bool io_bdev_nowait(struct block_device *bdev)
-{
- return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
-static bool __io_file_supports_nowait(struct file *file, umode_t mode)
-{
- if (S_ISBLK(mode)) {
- if (IS_ENABLED(CONFIG_BLOCK) &&
- io_bdev_nowait(I_BDEV(file->f_mapping->host)))
- return true;
- return false;
- }
- if (S_ISSOCK(mode))
- return true;
- if (S_ISREG(mode)) {
- if (IS_ENABLED(CONFIG_BLOCK) &&
- io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
- file->f_op != &io_uring_fops)
- return true;
- return false;
- }
-
- /* any ->read/write should understand O_NONBLOCK */
- if (file->f_flags & O_NONBLOCK)
- return true;
- return file->f_mode & FMODE_NOWAIT;
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
-static unsigned int io_file_get_flags(struct file *file)
-{
- umode_t mode = file_inode(file)->i_mode;
- unsigned int res = 0;
-
- if (S_ISREG(mode))
- res |= FFS_ISREG;
- if (__io_file_supports_nowait(file, mode))
- res |= FFS_NOWAIT;
- if (io_file_need_scm(file))
- res |= FFS_SCM;
- return res;
-}
-
-static inline bool io_file_supports_nowait(struct io_kiocb *req)
-{
- return req->flags & REQ_F_SUPPORT_NOWAIT;
-}
-
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct kiocb *kiocb = &req->rw.kiocb;
- unsigned ioprio;
- int ret;
-
- kiocb->ki_pos = READ_ONCE(sqe->off);
- /* used for fixed read/write too - just read unconditionally */
- req->buf_index = READ_ONCE(sqe->buf_index);
-
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- struct io_ring_ctx *ctx = req->ctx;
- u16 index;
-
- if (unlikely(req->buf_index >= ctx->nr_user_bufs))
- return -EFAULT;
- index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
- req->imu = ctx->user_bufs[index];
- io_req_set_rsrc_node(req, ctx, 0);
- }
-
- ioprio = READ_ONCE(sqe->ioprio);
- if (ioprio) {
- ret = ioprio_check_cap(ioprio);
- if (ret)
- return ret;
-
- kiocb->ki_ioprio = ioprio;
- } else {
- kiocb->ki_ioprio = get_current_ioprio();
- }
-
- req->rw.addr = READ_ONCE(sqe->addr);
- req->rw.len = READ_ONCE(sqe->len);
- req->rw.flags = READ_ONCE(sqe->rw_flags);
- return 0;
-}
-
-static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
-{
- switch (ret) {
- case -EIOCBQUEUED:
- break;
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- case -ERESTARTNOHAND:
- case -ERESTART_RESTARTBLOCK:
- /*
- * We can't just restart the syscall, since previously
- * submitted sqes may already be in progress. Just fail this
- * IO with EINTR.
- */
- ret = -EINTR;
- fallthrough;
- default:
- kiocb->ki_complete(kiocb, ret);
- }
-}
-
-static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
-{
- struct kiocb *kiocb = &req->rw.kiocb;
-
- if (kiocb->ki_pos != -1)
- return &kiocb->ki_pos;
-
- if (!(req->file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = req->file->f_pos;
- return &kiocb->ki_pos;
- }
-
- kiocb->ki_pos = 0;
- return NULL;
-}
-
-static void kiocb_done(struct io_kiocb *req, ssize_t ret,
- unsigned int issue_flags)
-{
- struct io_async_rw *io = req->async_data;
-
- /* add previously done IO, if any */
- if (req_has_async_data(req) && io->bytes_done > 0) {
- if (ret < 0)
- ret = io->bytes_done;
- else
- ret += io->bytes_done;
- }
-
- if (req->flags & REQ_F_CUR_POS)
- req->file->f_pos = req->rw.kiocb.ki_pos;
- if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
- __io_complete_rw(req, ret, issue_flags);
- else
- io_rw_done(&req->rw.kiocb, ret);
-
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req))
- io_req_task_queue_reissue(req);
- else
- io_req_task_queue_fail(req, ret);
- }
-}
-
-static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
- struct io_mapped_ubuf *imu)
-{
- size_t len = req->rw.len;
- u64 buf_end, buf_addr = req->rw.addr;
- size_t offset;
-
- if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
- return -EFAULT;
- /* not inside the mapped region */
- if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
- return -EFAULT;
-
- /*
- * May not be a start of buffer, set size appropriately
- * and advance us to the beginning.
- */
- offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-
- if (offset) {
- /*
- * Don't use iov_iter_advance() here, as it's really slow for
- * using the latter parts of a big fixed buffer - it iterates
- * over each segment manually. We can cheat a bit here, because
- * we know that:
- *
- * 1) it's a BVEC iter, we set it up
- * 2) all bvecs are PAGE_SIZE in size, except potentially the
- * first and last bvec
- *
- * So just find our index, and adjust the iterator afterwards.
- * If the offset is within the first bvec (or the whole first
- * bvec, just use iov_iter_advance(). This makes it easier
- * since we can just skip the first segment, which may not
- * be PAGE_SIZE aligned.
- */
- const struct bio_vec *bvec = imu->bvec;
-
- if (offset <= bvec->bv_len) {
- iov_iter_advance(iter, offset);
- } else {
- unsigned long seg_skip;
-
- /* skip first vec */
- offset -= bvec->bv_len;
- seg_skip = 1 + (offset >> PAGE_SHIFT);
-
- iter->bvec = bvec + seg_skip;
- iter->nr_segs -= seg_skip;
- iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ~PAGE_MASK;
- }
- }
-
- return 0;
-}
-
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
- unsigned int issue_flags)
-{
- if (WARN_ON_ONCE(!req->imu))
- return -EFAULT;
- return __io_import_fixed(req, rw, iter, req->imu);
-}
-
-static int io_buffer_add_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned int bgid)
-{
- bl->bgid = bgid;
- if (bgid < BGID_ARRAY)
- return 0;
-
- return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
-}
-
-static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
- struct io_buffer_list *bl)
-{
- if (!list_empty(&bl->buf_list)) {
- struct io_buffer *kbuf;
-
- kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
- list_del(&kbuf->list);
- if (*len > kbuf->len)
- *len = kbuf->len;
- req->flags |= REQ_F_BUFFER_SELECTED;
- req->kbuf = kbuf;
- req->buf_index = kbuf->bid;
- return u64_to_user_ptr(kbuf->addr);
- }
- return NULL;
-}
-
-static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
- struct io_buffer_list *bl,
- unsigned int issue_flags)
-{
- struct io_uring_buf_ring *br = bl->buf_ring;
- struct io_uring_buf *buf;
- __u16 head = bl->head;
-
- if (unlikely(smp_load_acquire(&br->tail) == head))
- return NULL;
-
- head &= bl->mask;
- if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
- buf = &br->bufs[head];
- } else {
- int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
- int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
- buf = page_address(bl->buf_pages[index]);
- buf += off;
- }
- if (*len > buf->len)
- *len = buf->len;
- req->flags |= REQ_F_BUFFER_RING;
- req->buf_list = bl;
- req->buf_index = buf->bid;
-
- if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
- /*
- * If we came in unlocked, we have no choice but to consume the
- * buffer here. This does mean it'll be pinned until the IO
- * completes. But coming in unlocked means we're in io-wq
- * context, hence there should be no further retry. For the
- * locked case, the caller must ensure to call the commit when
- * the transfer completes (or if we get -EAGAIN and must poll
- * or retry).
- */
- req->buf_list = NULL;
- bl->head++;
- }
- return u64_to_user_ptr(buf->addr);
-}
-
-static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- void __user *ret = NULL;
-
- io_ring_submit_lock(req->ctx, issue_flags);
-
- bl = io_buffer_get_list(ctx, req->buf_index);
- if (likely(bl)) {
- if (bl->buf_nr_pages)
- ret = io_ring_buffer_select(req, len, bl, issue_flags);
- else
- ret = io_provided_buffer_select(req, len, bl);
- }
- io_ring_submit_unlock(req->ctx, issue_flags);
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- unsigned int issue_flags)
-{
- struct compat_iovec __user *uiov;
- compat_ssize_t clen;
- void __user *buf;
- size_t len;
-
- uiov = u64_to_user_ptr(req->rw.addr);
- if (!access_ok(uiov, sizeof(*uiov)))
- return -EFAULT;
- if (__get_user(clen, &uiov->iov_len))
- return -EFAULT;
- if (clen < 0)
- return -EINVAL;
-
- len = clen;
- buf = io_buffer_select(req, &len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- req->rw.addr = (unsigned long) buf;
- iov[0].iov_base = buf;
- req->rw.len = iov[0].iov_len = (compat_size_t) len;
- return 0;
-}
-#endif
-
-static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- unsigned int issue_flags)
-{
- struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
- void __user *buf;
- ssize_t len;
-
- if (copy_from_user(iov, uiov, sizeof(*uiov)))
- return -EFAULT;
-
- len = iov[0].iov_len;
- if (len < 0)
- return -EINVAL;
- buf = io_buffer_select(req, &len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- req->rw.addr = (unsigned long) buf;
- iov[0].iov_base = buf;
- req->rw.len = iov[0].iov_len = len;
- return 0;
-}
-
-static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- unsigned int issue_flags)
-{
- if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
- iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
- iov[0].iov_len = req->rw.len;
- return 0;
- }
- if (req->rw.len != 1)
- return -EINVAL;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return io_compat_import(req, iov, issue_flags);
-#endif
-
- return __io_iov_buffer_select(req, iov, issue_flags);
-}
-
-static inline bool io_do_buffer_select(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return false;
- return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
-}
-
-static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
- struct io_rw_state *s,
- unsigned int issue_flags)
-{
- struct iov_iter *iter = &s->iter;
- u8 opcode = req->opcode;
- struct iovec *iovec;
- void __user *buf;
- size_t sqe_len;
- ssize_t ret;
-
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(req, rw, iter, issue_flags);
- if (ret)
- return ERR_PTR(ret);
- return NULL;
- }
-
- buf = u64_to_user_ptr(req->rw.addr);
- sqe_len = req->rw.len;
-
- if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- if (io_do_buffer_select(req)) {
- buf = io_buffer_select(req, &sqe_len, issue_flags);
- if (!buf)
- return ERR_PTR(-ENOBUFS);
- req->rw.addr = (unsigned long) buf;
- req->rw.len = sqe_len;
- }
-
- ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
- if (ret)
- return ERR_PTR(ret);
- return NULL;
- }
-
- iovec = s->fast_iov;
- if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, iovec, issue_flags);
- if (ret)
- return ERR_PTR(ret);
- iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
- return NULL;
- }
-
- ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
- req->ctx->compat);
- if (unlikely(ret < 0))
- return ERR_PTR(ret);
- return iovec;
-}
-
-static inline int io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct io_rw_state *s,
- unsigned int issue_flags)
-{
- *iovec = __io_import_iovec(rw, req, s, issue_flags);
- if (unlikely(IS_ERR(*iovec)))
- return PTR_ERR(*iovec);
-
- iov_iter_save_state(&s->iter, &s->iter_state);
- return 0;
-}
-
-static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
-{
- return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
-}
-
-/*
- * For files that don't have ->read_iter() and ->write_iter(), handle them
- * by looping over ->read() or ->write() manually.
- */
-static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
-{
- struct kiocb *kiocb = &req->rw.kiocb;
- struct file *file = req->file;
- ssize_t ret = 0;
- loff_t *ppos;
-
- /*
- * Don't support polled IO through this interface, and we can't
- * support non-blocking either. For the latter, this just causes
- * the kiocb to be handled from an async context.
- */
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EOPNOTSUPP;
- if ((kiocb->ki_flags & IOCB_NOWAIT) &&
- !(kiocb->ki_filp->f_flags & O_NONBLOCK))
- return -EAGAIN;
-
- ppos = io_kiocb_ppos(kiocb);
-
- while (iov_iter_count(iter)) {
- struct iovec iovec;
- ssize_t nr;
-
- if (!iov_iter_is_bvec(iter)) {
- iovec = iov_iter_iovec(iter);
- } else {
- iovec.iov_base = u64_to_user_ptr(req->rw.addr);
- iovec.iov_len = req->rw.len;
- }
-
- if (rw == READ) {
- nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, ppos);
- } else {
- nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, ppos);
- }
-
- if (nr < 0) {
- if (!ret)
- ret = nr;
- break;
- }
- ret += nr;
- if (!iov_iter_is_bvec(iter)) {
- iov_iter_advance(iter, nr);
- } else {
- req->rw.addr += nr;
- req->rw.len -= nr;
- if (!req->rw.len)
- break;
- }
- if (nr != iovec.iov_len)
- break;
- }
-
- return ret;
-}
-
-static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov, struct iov_iter *iter)
-{
- struct io_async_rw *rw = req->async_data;
-
- memcpy(&rw->s.iter, iter, sizeof(*iter));
- rw->free_iovec = iovec;
- rw->bytes_done = 0;
- /* can only be fixed buffers, no need to do anything */
- if (iov_iter_is_bvec(iter))
- return;
- if (!iovec) {
- unsigned iov_off = 0;
-
- rw->s.iter.iov = rw->s.fast_iov;
- if (iter->iov != fast_iov) {
- iov_off = iter->iov - fast_iov;
- rw->s.iter.iov += iov_off;
- }
- if (rw->s.fast_iov != fast_iov)
- memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
- sizeof(struct iovec) * iter->nr_segs);
- } else {
- req->flags |= REQ_F_NEED_CLEANUP;
- }
-}
-
-static inline bool io_alloc_async_data(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
- req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- if (req->async_data) {
- req->flags |= REQ_F_ASYNC_DATA;
- return false;
- }
- return true;
-}
-
-static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- struct io_rw_state *s, bool force)
-{
- if (!force && !io_op_defs[req->opcode].needs_async_setup)
- return 0;
- if (!req_has_async_data(req)) {
- struct io_async_rw *iorw;
-
- if (io_alloc_async_data(req)) {
- kfree(iovec);
- return -ENOMEM;
- }
-
- io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
- iorw = req->async_data;
- /* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
- }
- return 0;
-}
-
-static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
-{
- struct io_async_rw *iorw = req->async_data;
- struct iovec *iov;
- int ret;
-
- /* submission path, ->uring_lock should already be taken */
- ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
- if (unlikely(ret < 0))
- return ret;
-
- iorw->bytes_done = 0;
- iorw->free_iovec = iov;
- if (iov)
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_readv_prep_async(struct io_kiocb *req)
-{
- return io_rw_prep_async(req, READ);
-}
-
-static int io_writev_prep_async(struct io_kiocb *req)
-{
- return io_rw_prep_async(req, WRITE);
-}
-
-/*
- * This is our waitqueue callback handler, registered through __folio_lock_async()
- * when we initially tried to do the IO with the iocb armed our waitqueue.
- * This gets called when the page is unlocked, and we generally expect that to
- * happen when the page IO is completed and the page is now uptodate. This will
- * queue a task_work based retry of the operation, attempting to copy the data
- * again. If the latter fails because the page was NOT uptodate, then we will
- * do a thread based blocking retry of the operation. That's the unexpected
- * slow path.
- */
-static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *arg)
-{
- struct wait_page_queue *wpq;
- struct io_kiocb *req = wait->private;
- struct wait_page_key *key = arg;
-
- wpq = container_of(wait, struct wait_page_queue, wait);
-
- if (!wake_page_match(wpq, key))
- return 0;
-
- req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
- list_del_init(&wait->entry);
- io_req_task_queue(req);
- return 1;
-}
-
-/*
- * This controls whether a given IO request should be armed for async page
- * based retry. If we return false here, the request is handed to the async
- * worker threads for retry. If we're doing buffered reads on a regular file,
- * we prepare a private wait_page_queue entry and retry the operation. This
- * will either succeed because the page is now uptodate and unlocked, or it
- * will register a callback when the page is unlocked at IO completion. Through
- * that callback, io_uring uses task_work to setup a retry of the operation.
- * That retry will attempt the buffered read again. The retry will generally
- * succeed, or in rare cases where it fails, we then fall back to using the
- * async worker threads for a blocking retry.
- */
-static bool io_rw_should_retry(struct io_kiocb *req)
-{
- struct io_async_rw *rw = req->async_data;
- struct wait_page_queue *wait = &rw->wpq;
- struct kiocb *kiocb = &req->rw.kiocb;
-
- /* never retry for NOWAIT, we just complete with -EAGAIN */
- if (req->flags & REQ_F_NOWAIT)
- return false;
-
- /* Only for buffered IO */
- if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
- return false;
-
- /*
- * just use poll if we can, and don't attempt if the fs doesn't
- * support callback based unlocks
- */
- if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
- return false;
-
- wait->wait.func = io_async_buf_func;
- wait->wait.private = req;
- wait->wait.flags = 0;
- INIT_LIST_HEAD(&wait->wait.entry);
- kiocb->ki_flags |= IOCB_WAITQ;
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- kiocb->ki_waitq = wait;
- return true;
-}
-
-static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
-{
- if (likely(req->file->f_op->read_iter))
- return call_read_iter(req->file, &req->rw.kiocb, iter);
- else if (req->file->f_op->read)
- return loop_rw_iter(READ, req, iter);
- else
- return -EINVAL;
-}
-
-static bool need_read_all(struct io_kiocb *req)
-{
- return req->flags & REQ_F_ISREG ||
- S_ISBLK(file_inode(req->file)->i_mode);
-}
-
-static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
-{
- struct kiocb *kiocb = &req->rw.kiocb;
- struct io_ring_ctx *ctx = req->ctx;
- struct file *file = req->file;
- int ret;
-
- if (unlikely(!file || !(file->f_mode & mode)))
- return -EBADF;
-
- if (!io_req_ffs_set(req))
- req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
-
- kiocb->ki_flags = iocb_flags(file);
- ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
- if (unlikely(ret))
- return ret;
-
- /*
- * If the file is marked O_NONBLOCK, still allow retry for it if it
- * supports async. Otherwise it's impossible to use O_NONBLOCK files
- * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
- */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
- req->flags |= REQ_F_NOWAIT;
-
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
- return -EOPNOTSUPP;
-
- kiocb->private = NULL;
- kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
- kiocb->ki_complete = io_complete_rw_iopoll;
- req->iopoll_completed = 0;
- } else {
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EINVAL;
- kiocb->ki_complete = io_complete_rw;
- }
-
- return 0;
-}
-
-static int io_read(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_rw_state __s, *s = &__s;
- struct iovec *iovec;
- struct kiocb *kiocb = &req->rw.kiocb;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct io_async_rw *rw;
- ssize_t ret, ret2;
- loff_t *ppos;
-
- if (!req_has_async_data(req)) {
- ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- } else {
- /*
- * Safe and required to re-import if we're using provided
- * buffers, as we dropped the selected one before retry.
- */
- if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- }
-
- rw = req->async_data;
- s = &rw->s;
- /*
- * We come here from an earlier attempt, restore our state to
- * match in case it doesn't. It's cheap enough that we don't
- * need to make this conditional.
- */
- iov_iter_restore(&s->iter, &s->iter_state);
- iovec = NULL;
- }
- ret = io_rw_init_file(req, FMODE_READ);
- if (unlikely(ret)) {
- kfree(iovec);
- return ret;
- }
- req->cqe.res = iov_iter_count(&s->iter);
-
- if (force_nonblock) {
- /* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req))) {
- ret = io_setup_async_rw(req, iovec, s, true);
- return ret ?: -EAGAIN;
- }
- kiocb->ki_flags |= IOCB_NOWAIT;
- } else {
- /* Ensure we clear previously set non-block flag */
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- }
-
- ppos = io_kiocb_update_pos(req);
-
- ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
- if (unlikely(ret)) {
- kfree(iovec);
- return ret;
- }
-
- ret = io_iter_do_read(req, &s->iter);
-
- if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
- req->flags &= ~REQ_F_REISSUE;
- /* if we can poll, just do that */
- if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
- return -EAGAIN;
- /* IOPOLL retry should happen for io-wq threads */
- if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
- goto done;
- /* no retry on NONBLOCK nor RWF_NOWAIT */
- if (req->flags & REQ_F_NOWAIT)
- goto done;
- ret = 0;
- } else if (ret == -EIOCBQUEUED) {
- goto out_free;
- } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
- /* read all, failed, already did sync or don't want to retry */
- goto done;
- }
-
- /*
- * Don't depend on the iter state matching what was consumed, or being
- * untouched in case of error. Restore it and we'll advance it
- * manually if we need to.
- */
- iov_iter_restore(&s->iter, &s->iter_state);
-
- ret2 = io_setup_async_rw(req, iovec, s, true);
- if (ret2)
- return ret2;
-
- iovec = NULL;
- rw = req->async_data;
- s = &rw->s;
- /*
- * Now use our persistent iterator and state, if we aren't already.
- * We've restored and mapped the iter to match.
- */
-
- do {
- /*
- * We end up here because of a partial read, either from
- * above or inside this loop. Advance the iter by the bytes
- * that were consumed.
- */
- iov_iter_advance(&s->iter, ret);
- if (!iov_iter_count(&s->iter))
- break;
- rw->bytes_done += ret;
- iov_iter_save_state(&s->iter, &s->iter_state);
-
- /* if we can retry, do so with the callbacks armed */
- if (!io_rw_should_retry(req)) {
- kiocb->ki_flags &= ~IOCB_WAITQ;
- return -EAGAIN;
- }
-
- /*
- * Now retry read with the IOCB_WAITQ parts set in the iocb. If
- * we get -EIOCBQUEUED, then we'll get a notification when the
- * desired page gets unlocked. We can also get a partial read
- * here, and if we do, then just retry at the new offset.
- */
- ret = io_iter_do_read(req, &s->iter);
- if (ret == -EIOCBQUEUED)
- return 0;
- /* we got some bytes, but not all. retry. */
- kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(&s->iter, &s->iter_state);
- } while (ret > 0);
-done:
- kiocb_done(req, ret, issue_flags);
-out_free:
- /* it's faster to check here then delegate to kfree */
- if (iovec)
- kfree(iovec);
- return 0;
-}
-
-static int io_write(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_rw_state __s, *s = &__s;
- struct iovec *iovec;
- struct kiocb *kiocb = &req->rw.kiocb;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- ssize_t ret, ret2;
- loff_t *ppos;
-
- if (!req_has_async_data(req)) {
- ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- } else {
- struct io_async_rw *rw = req->async_data;
-
- s = &rw->s;
- iov_iter_restore(&s->iter, &s->iter_state);
- iovec = NULL;
- }
- ret = io_rw_init_file(req, FMODE_WRITE);
- if (unlikely(ret)) {
- kfree(iovec);
- return ret;
- }
- req->cqe.res = iov_iter_count(&s->iter);
-
- if (force_nonblock) {
- /* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req)))
- goto copy_iov;
-
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
-
- kiocb->ki_flags |= IOCB_NOWAIT;
- } else {
- /* Ensure we clear previously set non-block flag */
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- }
-
- ppos = io_kiocb_update_pos(req);
-
- ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
- if (unlikely(ret))
- goto out_free;
-
- /*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * io_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
- */
- if (req->flags & REQ_F_ISREG) {
- sb_start_write(file_inode(req->file)->i_sb);
- __sb_writers_release(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE);
- }
- kiocb->ki_flags |= IOCB_WRITE;
-
- if (likely(req->file->f_op->write_iter))
- ret2 = call_write_iter(req->file, kiocb, &s->iter);
- else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, &s->iter);
- else
- ret2 = -EINVAL;
-
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- ret2 = -EAGAIN;
- }
-
- /*
- * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
- * retry them without IOCB_NOWAIT.
- */
- if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
- ret2 = -EAGAIN;
- /* no retry on NONBLOCK nor RWF_NOWAIT */
- if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
- goto done;
- if (!force_nonblock || ret2 != -EAGAIN) {
- /* IOPOLL retry should happen for io-wq threads */
- if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
- goto copy_iov;
-done:
- kiocb_done(req, ret2, issue_flags);
- } else {
-copy_iov:
- iov_iter_restore(&s->iter, &s->iter_state);
- ret = io_setup_async_rw(req, iovec, s, false);
- return ret ?: -EAGAIN;
- }
-out_free:
- /* it's reportedly faster than delegating the null check to kfree() */
- if (iovec)
- kfree(iovec);
- return ret;
-}
-
-static int io_renameat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_rename *ren = &req->rename;
- const char __user *oldf, *newf;
-
- if (sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- ren->old_dfd = READ_ONCE(sqe->fd);
- oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- ren->new_dfd = READ_ONCE(sqe->len);
- ren->flags = READ_ONCE(sqe->rename_flags);
-
- ren->oldpath = getname(oldf);
- if (IS_ERR(ren->oldpath))
- return PTR_ERR(ren->oldpath);
-
- ren->newpath = getname(newf);
- if (IS_ERR(ren->newpath)) {
- putname(ren->oldpath);
- return PTR_ERR(ren->newpath);
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_rename *ren = &req->rename;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
- ren->newpath, ren->flags);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- io_req_complete(req, ret);
- return 0;
-}
-
-static inline void __io_xattr_finish(struct io_kiocb *req)
-{
- struct io_xattr *ix = &req->xattr;
-
- if (ix->filename)
- putname(ix->filename);
-
- kfree(ix->ctx.kname);
- kvfree(ix->ctx.kvalue);
-}
-
-static void io_xattr_finish(struct io_kiocb *req, int ret)
-{
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
- __io_xattr_finish(req);
- io_req_complete(req, ret);
-}
-
-static int __io_getxattr_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_xattr *ix = &req->xattr;
- const char __user *name;
- int ret;
-
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- ix->filename = NULL;
- ix->ctx.kvalue = NULL;
- name = u64_to_user_ptr(READ_ONCE(sqe->addr));
- ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- ix->ctx.size = READ_ONCE(sqe->len);
- ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
-
- if (ix->ctx.flags)
- return -EINVAL;
-
- ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
- if (!ix->ctx.kname)
- return -ENOMEM;
-
- ret = strncpy_from_user(ix->ctx.kname->name, name,
- sizeof(ix->ctx.kname->name));
- if (!ret || ret == sizeof(ix->ctx.kname->name))
- ret = -ERANGE;
- if (ret < 0) {
- kfree(ix->ctx.kname);
- return ret;
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_fgetxattr_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- return __io_getxattr_prep(req, sqe);
-}
-
-static int io_getxattr_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_xattr *ix = &req->xattr;
- const char __user *path;
- int ret;
-
- ret = __io_getxattr_prep(req, sqe);
- if (ret)
- return ret;
-
- path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
-
- ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
- if (IS_ERR(ix->filename)) {
- ret = PTR_ERR(ix->filename);
- ix->filename = NULL;
- }
-
- return ret;
-}
-
-static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_xattr *ix = &req->xattr;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
- req->file->f_path.dentry,
- &ix->ctx);
-
- io_xattr_finish(req, ret);
- return 0;
-}
-
-static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_xattr *ix = &req->xattr;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
- struct path path;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
-retry:
- ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
- if (!ret) {
- ret = do_getxattr(mnt_user_ns(path.mnt),
- path.dentry,
- &ix->ctx);
-
- path_put(&path);
- if (retry_estale(ret, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- }
-
- io_xattr_finish(req, ret);
- return 0;
-}
-
-static int __io_setxattr_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_xattr *ix = &req->xattr;
- const char __user *name;
- int ret;
-
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- ix->filename = NULL;
- name = u64_to_user_ptr(READ_ONCE(sqe->addr));
- ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- ix->ctx.kvalue = NULL;
- ix->ctx.size = READ_ONCE(sqe->len);
- ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
-
- ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
- if (!ix->ctx.kname)
- return -ENOMEM;
-
- ret = setxattr_copy(name, &ix->ctx);
- if (ret) {
- kfree(ix->ctx.kname);
- return ret;
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_setxattr_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_xattr *ix = &req->xattr;
- const char __user *path;
- int ret;
-
- ret = __io_setxattr_prep(req, sqe);
- if (ret)
- return ret;
-
- path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
-
- ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
- if (IS_ERR(ix->filename)) {
- ret = PTR_ERR(ix->filename);
- ix->filename = NULL;
- }
-
- return ret;
-}
-
-static int io_fsetxattr_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- return __io_setxattr_prep(req, sqe);
-}
-
-static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
- struct path *path)
-{
- struct io_xattr *ix = &req->xattr;
- int ret;
-
- ret = mnt_want_write(path->mnt);
- if (!ret) {
- ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
- mnt_drop_write(path->mnt);
- }
-
- return ret;
-}
-
-static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
-{
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = __io_setxattr(req, issue_flags, &req->file->f_path);
- io_xattr_finish(req, ret);
-
- return 0;
-}
-
-static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_xattr *ix = &req->xattr;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
- struct path path;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
-retry:
- ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
- if (!ret) {
- ret = __io_setxattr(req, issue_flags, &path);
- path_put(&path);
- if (retry_estale(ret, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- }
-
- io_xattr_finish(req, ret);
- return 0;
-}
-
-static int io_unlinkat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_unlink *un = &req->unlink;
- const char __user *fname;
-
- if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- un->dfd = READ_ONCE(sqe->fd);
-
- un->flags = READ_ONCE(sqe->unlink_flags);
- if (un->flags & ~AT_REMOVEDIR)
- return -EINVAL;
-
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- un->filename = getname(fname);
- if (IS_ERR(un->filename))
- return PTR_ERR(un->filename);
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_unlink *un = &req->unlink;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- if (un->flags & AT_REMOVEDIR)
- ret = do_rmdir(un->dfd, un->filename);
- else
- ret = do_unlinkat(un->dfd, un->filename);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_mkdirat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_mkdir *mkd = &req->mkdir;
- const char __user *fname;
-
- if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- mkd->dfd = READ_ONCE(sqe->fd);
- mkd->mode = READ_ONCE(sqe->len);
-
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- mkd->filename = getname(fname);
- if (IS_ERR(mkd->filename))
- return PTR_ERR(mkd->filename);
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_mkdir *mkd = &req->mkdir;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_symlinkat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_symlink *sl = &req->symlink;
- const char __user *oldpath, *newpath;
-
- if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- sl->new_dfd = READ_ONCE(sqe->fd);
- oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
-
- sl->oldpath = getname(oldpath);
- if (IS_ERR(sl->oldpath))
- return PTR_ERR(sl->oldpath);
-
- sl->newpath = getname(newpath);
- if (IS_ERR(sl->newpath)) {
- putname(sl->oldpath);
- return PTR_ERR(sl->newpath);
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_symlink *sl = &req->symlink;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_linkat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_hardlink *lnk = &req->hardlink;
- const char __user *oldf, *newf;
-
- if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- lnk->old_dfd = READ_ONCE(sqe->fd);
- lnk->new_dfd = READ_ONCE(sqe->len);
- oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- lnk->flags = READ_ONCE(sqe->hardlink_flags);
-
- lnk->oldpath = getname(oldf);
- if (IS_ERR(lnk->oldpath))
- return PTR_ERR(lnk->oldpath);
-
- lnk->newpath = getname(newf);
- if (IS_ERR(lnk->newpath)) {
- putname(lnk->oldpath);
- return PTR_ERR(lnk->newpath);
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_hardlink *lnk = &req->hardlink;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
- lnk->newpath, lnk->flags);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- io_req_complete(req, ret);
- return 0;
-}
-
-static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
-{
- req->uring_cmd.task_work_cb(&req->uring_cmd);
-}
-
-void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *))
-{
- struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
-
- req->uring_cmd.task_work_cb = task_work_cb;
- req->io_task_work.func = io_uring_cmd_work;
- io_req_task_work_add(req);
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
-
-static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
- u64 extra1, u64 extra2)
-{
- req->extra1 = extra1;
- req->extra2 = extra2;
- req->flags |= REQ_F_CQE32_INIT;
-}
-
-/*
- * Called by consumers of io_uring_cmd, if they originally returned
- * -EIOCBQUEUED upon receiving the command.
- */
-void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
-{
- struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
-
- if (ret < 0)
- req_set_fail(req);
-
- if (req->ctx->flags & IORING_SETUP_CQE32)
- io_req_set_cqe32_extra(req, res2, 0);
- io_req_complete(req, ret);
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_done);
-
-static int io_uring_cmd_prep_async(struct io_kiocb *req)
-{
- size_t cmd_size;
-
- cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
-
- memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
- return 0;
-}
-
-static int io_uring_cmd_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_uring_cmd *ioucmd = &req->uring_cmd;
-
- if (sqe->rw_flags)
- return -EINVAL;
- ioucmd->cmd = sqe->cmd;
- ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
- return 0;
-}
-
-static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_uring_cmd *ioucmd = &req->uring_cmd;
- struct io_ring_ctx *ctx = req->ctx;
- struct file *file = req->file;
- int ret;
-
- if (!req->file->f_op->uring_cmd)
- return -EOPNOTSUPP;
-
- if (ctx->flags & IORING_SETUP_SQE128)
- issue_flags |= IO_URING_F_SQE128;
- if (ctx->flags & IORING_SETUP_CQE32)
- issue_flags |= IO_URING_F_CQE32;
- if (ctx->flags & IORING_SETUP_IOPOLL)
- issue_flags |= IO_URING_F_IOPOLL;
-
- if (req_has_async_data(req))
- ioucmd->cmd = req->async_data;
-
- ret = file->f_op->uring_cmd(ioucmd, issue_flags);
- if (ret == -EAGAIN) {
- if (!req_has_async_data(req)) {
- if (io_alloc_async_data(req))
- return -ENOMEM;
- io_uring_cmd_prep_async(req);
- }
- return -EAGAIN;
- }
-
- if (ret != -EIOCBQUEUED)
- io_uring_cmd_done(ioucmd, ret, 0);
- return 0;
-}
-
-static int __io_splice_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_splice *sp = &req->splice;
- unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
-
- sp->len = READ_ONCE(sqe->len);
- sp->flags = READ_ONCE(sqe->splice_flags);
- if (unlikely(sp->flags & ~valid_flags))
- return -EINVAL;
- sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
- return 0;
-}
-
-static int io_tee_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
- return -EINVAL;
- return __io_splice_prep(req, sqe);
-}
-
-static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_splice *sp = &req->splice;
- struct file *out = sp->file_out;
- unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
- struct file *in;
- long ret = 0;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- if (sp->flags & SPLICE_F_FD_IN_FIXED)
- in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
- else
- in = io_file_get_normal(req, sp->splice_fd_in);
- if (!in) {
- ret = -EBADF;
- goto done;
- }
-
- if (sp->len)
- ret = do_tee(in, out, sp->len, flags);
-
- if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(in);
-done:
- if (ret != sp->len)
- req_set_fail(req);
- __io_req_complete(req, 0, ret, 0);
- return 0;
-}
-
-static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_splice *sp = &req->splice;
-
- sp->off_in = READ_ONCE(sqe->splice_off_in);
- sp->off_out = READ_ONCE(sqe->off);
- return __io_splice_prep(req, sqe);
-}
-
-static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_splice *sp = &req->splice;
- struct file *out = sp->file_out;
- unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
- loff_t *poff_in, *poff_out;
- struct file *in;
- long ret = 0;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- if (sp->flags & SPLICE_F_FD_IN_FIXED)
- in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
- else
- in = io_file_get_normal(req, sp->splice_fd_in);
- if (!in) {
- ret = -EBADF;
- goto done;
- }
-
- poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
- poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
-
- if (sp->len)
- ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
-
- if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(in);
-done:
- if (ret != sp->len)
- req_set_fail(req);
- __io_req_complete(req, 0, ret, 0);
- return 0;
-}
-
-static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return 0;
-}
-
-/*
- * IORING_OP_NOP just posts a completion event, nothing else.
- */
-static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
-{
- __io_req_complete(req, issue_flags, 0, 0);
- return 0;
-}
-
-static int io_msg_ring_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
- sqe->buf_index || sqe->personality))
- return -EINVAL;
-
- req->msg.user_data = READ_ONCE(sqe->off);
- req->msg.len = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *target_ctx;
- struct io_msg *msg = &req->msg;
- bool filled;
- int ret;
-
- ret = -EBADFD;
- if (req->file->f_op != &io_uring_fops)
- goto done;
-
- ret = -EOVERFLOW;
- target_ctx = req->file->private_data;
-
- spin_lock(&target_ctx->completion_lock);
- filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
- io_commit_cqring(target_ctx);
- spin_unlock(&target_ctx->completion_lock);
-
- if (filled) {
- io_cqring_ev_posted(target_ctx);
- ret = 0;
- }
-
-done:
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- /* put file to avoid an attempt to IOPOLL the req */
- io_put_file(req->file);
- req->file = NULL;
- return 0;
-}
-
-static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
- return -EINVAL;
-
- req->sync.flags = READ_ONCE(sqe->fsync_flags);
- if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
- return -EINVAL;
-
- req->sync.off = READ_ONCE(sqe->off);
- req->sync.len = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
-{
- loff_t end = req->sync.off + req->sync.len;
- int ret;
-
- /* fsync always requires a blocking context */
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = vfs_fsync_range(req->file, req->sync.off,
- end > 0 ? end : LLONG_MAX,
- req->sync.flags & IORING_FSYNC_DATASYNC);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_fallocate_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
- return -EINVAL;
-
- req->sync.off = READ_ONCE(sqe->off);
- req->sync.len = READ_ONCE(sqe->addr);
- req->sync.mode = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
-{
- int ret;
-
- /* fallocate always requiring blocking context */
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
- ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
- req->sync.len);
- if (ret >= 0)
- fsnotify_modify(req->file);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- const char __user *fname;
- int ret;
-
- if (unlikely(sqe->buf_index))
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- /* open.how should be already initialised */
- if (!(req->open.how.flags & O_PATH) && force_o_largefile())
- req->open.how.flags |= O_LARGEFILE;
-
- req->open.dfd = READ_ONCE(sqe->fd);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.filename = getname(fname);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->open.file_slot = READ_ONCE(sqe->file_index);
- if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
- return -EINVAL;
-
- req->open.nofile = rlimit(RLIMIT_NOFILE);
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- u64 mode = READ_ONCE(sqe->len);
- u64 flags = READ_ONCE(sqe->open_flags);
-
- req->open.how = build_open_how(flags, mode);
- return __io_openat_prep(req, sqe);
-}
-
-static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct open_how __user *how;
- size_t len;
- int ret;
-
- how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- len = READ_ONCE(sqe->len);
- if (len < OPEN_HOW_SIZE_VER0)
- return -EINVAL;
-
- ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
- len);
- if (ret)
- return ret;
-
- return __io_openat_prep(req, sqe);
-}
-
-static int io_file_bitmap_get(struct io_ring_ctx *ctx)
-{
- struct io_file_table *table = &ctx->file_table;
- unsigned long nr = ctx->nr_user_files;
- int ret;
-
- do {
- ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
- if (ret != nr)
- return ret;
-
- if (!table->alloc_hint)
- break;
-
- nr = table->alloc_hint;
- table->alloc_hint = 0;
- } while (1);
-
- return -ENFILE;
-}
-
-/*
- * Note when io_fixed_fd_install() returns error value, it will ensure
- * fput() is called correspondingly.
- */
-static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
- struct file *file, unsigned int file_slot)
-{
- bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- if (alloc_slot) {
- ret = io_file_bitmap_get(ctx);
- if (unlikely(ret < 0))
- goto err;
- file_slot = ret;
- } else {
- file_slot--;
- }
-
- ret = io_install_fixed_file(req, file, issue_flags, file_slot);
- if (!ret && alloc_slot)
- ret = file_slot;
-err:
- io_ring_submit_unlock(ctx, issue_flags);
- if (unlikely(ret < 0))
- fput(file);
- return ret;
-}
-
-static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct open_flags op;
- struct file *file;
- bool resolve_nonblock, nonblock_set;
- bool fixed = !!req->open.file_slot;
- int ret;
-
- ret = build_open_flags(&req->open.how, &op);
- if (ret)
- goto err;
- nonblock_set = op.open_flag & O_NONBLOCK;
- resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
- if (issue_flags & IO_URING_F_NONBLOCK) {
- /*
- * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
- * it'll always -EAGAIN
- */
- if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
- return -EAGAIN;
- op.lookup_flags |= LOOKUP_CACHED;
- op.open_flag |= O_NONBLOCK;
- }
-
- if (!fixed) {
- ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
- if (ret < 0)
- goto err;
- }
-
- file = do_filp_open(req->open.dfd, req->open.filename, &op);
- if (IS_ERR(file)) {
- /*
- * We could hang on to this 'fd' on retrying, but seems like
- * marginal gain for something that is now known to be a slower
- * path. So just put it, and we'll get a new one when we retry.
- */
- if (!fixed)
- put_unused_fd(ret);
-
- ret = PTR_ERR(file);
- /* only retry if RESOLVE_CACHED wasn't already set by application */
- if (ret == -EAGAIN &&
- (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
- return -EAGAIN;
- goto err;
- }
-
- if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
- file->f_flags &= ~O_NONBLOCK;
- fsnotify_open(file);
-
- if (!fixed)
- fd_install(ret, file);
- else
- ret = io_fixed_fd_install(req, issue_flags, file,
- req->open.file_slot);
-err:
- putname(req->open.filename);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
-{
- return io_openat2(req, issue_flags);
-}
-
-static int io_remove_buffers_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_provide_buf *p = &req->pbuf;
- u64 tmp;
-
- if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
- sqe->splice_fd_in)
- return -EINVAL;
-
- tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
- return -EINVAL;
-
- memset(p, 0, sizeof(*p));
- p->nbufs = tmp;
- p->bgid = READ_ONCE(sqe->buf_group);
- return 0;
-}
-
-static int __io_remove_buffers(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned nbufs)
-{
- unsigned i = 0;
-
- /* shouldn't happen */
- if (!nbufs)
- return 0;
-
- if (bl->buf_nr_pages) {
- int j;
-
- i = bl->buf_ring->tail - bl->head;
- for (j = 0; j < bl->buf_nr_pages; j++)
- unpin_user_page(bl->buf_pages[j]);
- kvfree(bl->buf_pages);
- bl->buf_pages = NULL;
- bl->buf_nr_pages = 0;
- /* make sure it's seen as empty */
- INIT_LIST_HEAD(&bl->buf_list);
- return i;
- }
-
- /* the head kbuf is the list itself */
- while (!list_empty(&bl->buf_list)) {
- struct io_buffer *nxt;
-
- nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
- list_del(&nxt->list);
- if (++i == nbufs)
- return i;
- cond_resched();
- }
- i++;
-
- return i;
-}
-
-static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = &req->pbuf;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- ret = -ENOENT;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (bl) {
- ret = -EINVAL;
- /* can't use provide/remove buffers command on mapped buffers */
- if (!bl->buf_nr_pages)
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
- }
- if (ret < 0)
- req_set_fail(req);
-
- /* complete before unlock, IOPOLL may need the lock */
- __io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, issue_flags);
- return 0;
-}
-
-static int io_provide_buffers_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- unsigned long size, tmp_check;
- struct io_provide_buf *p = &req->pbuf;
- u64 tmp;
-
- if (sqe->rw_flags || sqe->splice_fd_in)
- return -EINVAL;
-
- tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
- return -E2BIG;
- p->nbufs = tmp;
- p->addr = READ_ONCE(sqe->addr);
- p->len = READ_ONCE(sqe->len);
-
- if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
- &size))
- return -EOVERFLOW;
- if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
- return -EOVERFLOW;
-
- size = (unsigned long)p->len * p->nbufs;
- if (!access_ok(u64_to_user_ptr(p->addr), size))
- return -EFAULT;
-
- p->bgid = READ_ONCE(sqe->buf_group);
- tmp = READ_ONCE(sqe->off);
- if (tmp > USHRT_MAX)
- return -E2BIG;
- p->bid = tmp;
- return 0;
-}
-
-static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
-{
- struct io_buffer *buf;
- struct page *page;
- int bufs_in_page;
-
- /*
- * Completions that don't happen inline (eg not under uring_lock) will
- * add to ->io_buffers_comp. If we don't have any free buffers, check
- * the completion list and splice those entries first.
- */
- if (!list_empty_careful(&ctx->io_buffers_comp)) {
- spin_lock(&ctx->completion_lock);
- if (!list_empty(&ctx->io_buffers_comp)) {
- list_splice_init(&ctx->io_buffers_comp,
- &ctx->io_buffers_cache);
- spin_unlock(&ctx->completion_lock);
- return 0;
- }
- spin_unlock(&ctx->completion_lock);
- }
-
- /*
- * No free buffers and no completion entries either. Allocate a new
- * page worth of buffer entries and add those to our freelist.
- */
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return -ENOMEM;
-
- list_add(&page->lru, &ctx->io_buffers_pages);
-
- buf = page_address(page);
- bufs_in_page = PAGE_SIZE / sizeof(*buf);
- while (bufs_in_page) {
- list_add_tail(&buf->list, &ctx->io_buffers_cache);
- buf++;
- bufs_in_page--;
- }
-
- return 0;
-}
-
-static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
- struct io_buffer_list *bl)
-{
- struct io_buffer *buf;
- u64 addr = pbuf->addr;
- int i, bid = pbuf->bid;
-
- for (i = 0; i < pbuf->nbufs; i++) {
- if (list_empty(&ctx->io_buffers_cache) &&
- io_refill_buffer_cache(ctx))
- break;
- buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
- list);
- list_move_tail(&buf->list, &bl->buf_list);
- buf->addr = addr;
- buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
- buf->bid = bid;
- buf->bgid = pbuf->bgid;
- addr += pbuf->len;
- bid++;
- cond_resched();
- }
-
- return i ? 0 : -ENOMEM;
-}
-
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
- int i;
-
- ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
- GFP_KERNEL);
- if (!ctx->io_bl)
- return -ENOMEM;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
- ctx->io_bl[i].bgid = i;
- }
-
- return 0;
-}
-
-static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = &req->pbuf;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
- ret = io_init_bl_list(ctx);
- if (ret)
- goto err;
- }
-
- bl = io_buffer_get_list(ctx, p->bgid);
- if (unlikely(!bl)) {
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl) {
- ret = -ENOMEM;
- goto err;
- }
- INIT_LIST_HEAD(&bl->buf_list);
- ret = io_buffer_add_list(ctx, bl, p->bgid);
- if (ret) {
- kfree(bl);
- goto err;
- }
- }
- /* can't add buffers via this command for a mapped buffer ring */
- if (bl->buf_nr_pages) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = io_add_buffers(ctx, p, bl);
-err:
- if (ret < 0)
- req_set_fail(req);
- /* complete before unlock, IOPOLL may need the lock */
- __io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, issue_flags);
- return 0;
-}
-
-static int io_epoll_ctl_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
-#if defined(CONFIG_EPOLL)
- if (sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
-
- req->epoll.epfd = READ_ONCE(sqe->fd);
- req->epoll.op = READ_ONCE(sqe->len);
- req->epoll.fd = READ_ONCE(sqe->off);
-
- if (ep_op_has_event(req->epoll.op)) {
- struct epoll_event __user *ev;
-
- ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
- if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
- return -EFAULT;
- }
-
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
-{
-#if defined(CONFIG_EPOLL)
- struct io_epoll *ie = &req->epoll;
- int ret;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
-
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
- return -EINVAL;
-
- req->madvise.addr = READ_ONCE(sqe->addr);
- req->madvise.len = READ_ONCE(sqe->len);
- req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
-{
-#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- struct io_madvise *ma = &req->madvise;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
- io_req_complete(req, ret);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
- return -EINVAL;
-
- req->fadvise.offset = READ_ONCE(sqe->off);
- req->fadvise.len = READ_ONCE(sqe->len);
- req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
- return 0;
-}
-
-static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_fadvise *fa = &req->fadvise;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK) {
- switch (fa->advice) {
- case POSIX_FADV_NORMAL:
- case POSIX_FADV_RANDOM:
- case POSIX_FADV_SEQUENTIAL:
- break;
- default:
- return -EAGAIN;
- }
- }
-
- ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- const char __user *path;
-
- if (sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
-
- req->statx.dfd = READ_ONCE(sqe->fd);
- req->statx.mask = READ_ONCE(sqe->len);
- path = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- req->statx.flags = READ_ONCE(sqe->statx_flags);
-
- req->statx.filename = getname_flags(path,
- getname_statx_lookup_flags(req->statx.flags),
- NULL);
-
- if (IS_ERR(req->statx.filename)) {
- int ret = PTR_ERR(req->statx.filename);
-
- req->statx.filename = NULL;
- return ret;
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_statx *ctx = &req->statx;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
- ctx->buffer);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
-
- req->close.fd = READ_ONCE(sqe->fd);
- req->close.file_slot = READ_ONCE(sqe->file_index);
- if (req->close.file_slot && req->close.fd)
- return -EINVAL;
-
- return 0;
-}
-
-static int io_close(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct files_struct *files = current->files;
- struct io_close *close = &req->close;
- struct fdtable *fdt;
- struct file *file;
- int ret = -EBADF;
-
- if (req->close.file_slot) {
- ret = io_close_fixed(req, issue_flags);
- goto err;
- }
-
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (close->fd >= fdt->max_fds) {
- spin_unlock(&files->file_lock);
- goto err;
- }
- file = rcu_dereference_protected(fdt->fd[close->fd],
- lockdep_is_held(&files->file_lock));
- if (!file || file->f_op == &io_uring_fops) {
- spin_unlock(&files->file_lock);
- goto err;
- }
-
- /* if the file has a flush method, be safe and punt to async */
- if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
- spin_unlock(&files->file_lock);
- return -EAGAIN;
- }
-
- file = __close_fd_get_file(close->fd);
- spin_unlock(&files->file_lock);
- if (!file)
- goto err;
-
- /* No ->flush() or already async, safely close from here */
- ret = filp_close(file, current->files);
-err:
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
- return -EINVAL;
-
- req->sync.off = READ_ONCE(sqe->off);
- req->sync.len = READ_ONCE(sqe->len);
- req->sync.flags = READ_ONCE(sqe->sync_range_flags);
- return 0;
-}
-
-static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
-{
- int ret;
-
- /* sync_file_range always requires a blocking context */
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = sync_file_range(req->file, req->sync.off, req->sync.len,
- req->sync.flags);
- io_req_complete(req, ret);
- return 0;
-}
-
-#if defined(CONFIG_NET)
-static int io_shutdown_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
- sqe->buf_index || sqe->splice_fd_in))
- return -EINVAL;
-
- req->shutdown.how = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct socket *sock;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- ret = __sys_shutdown_sock(sock, req->shutdown.how);
- io_req_complete(req, ret);
- return 0;
-}
-
-static bool io_net_retry(struct socket *sock, int flags)
-{
- if (!(flags & MSG_WAITALL))
- return false;
- return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
-}
-
-static int io_setup_async_msg(struct io_kiocb *req,
- struct io_async_msghdr *kmsg)
-{
- struct io_async_msghdr *async_msg = req->async_data;
-
- if (async_msg)
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- kfree(kmsg->free_iov);
- return -ENOMEM;
- }
- async_msg = req->async_data;
- req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(async_msg, kmsg, sizeof(*kmsg));
- async_msg->msg.msg_name = &async_msg->addr;
- /* if were using fast_iov, set it to the new one */
- if (!async_msg->free_iov)
- async_msg->msg.msg_iter.iov = async_msg->fast_iov;
-
- return -EAGAIN;
-}
-
-static int io_sendmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->free_iov = iomsg->fast_iov;
- return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
- req->sr_msg.msg_flags, &iomsg->free_iov);
-}
-
-static int io_sendmsg_prep_async(struct io_kiocb *req)
-{
- int ret;
-
- ret = io_sendmsg_copy_hdr(req, req->async_data);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
-}
-
-static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- if (unlikely(sqe->file_index))
- return -EINVAL;
-
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- sr->len = READ_ONCE(sqe->len);
- sr->flags = READ_ONCE(sqe->addr2);
- if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
- return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
- if (sr->msg_flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- sr->msg_flags |= MSG_CMSG_COMPAT;
-#endif
- sr->done_io = 0;
- return 0;
-}
-
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_async_msghdr iomsg, *kmsg;
- struct io_sr_msg *sr = &req->sr_msg;
- struct socket *sock;
- unsigned flags;
- int min_ret = 0;
- int ret;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- } else {
- ret = io_sendmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg);
-
- flags = sr->msg_flags;
- if (issue_flags & IO_URING_F_NONBLOCK)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&kmsg->msg.msg_iter);
-
- ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-
- if (ret < min_ret) {
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret > 0 && io_net_retry(sock, flags)) {
- sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
- return io_setup_async_msg(req, kmsg);
- }
- req_set_fail(req);
- }
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret >= 0)
- ret += sr->done_io;
- else if (sr->done_io)
- ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_send(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct msghdr msg;
- struct iovec iov;
- struct socket *sock;
- unsigned flags;
- int min_ret = 0;
- int ret;
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return -EAGAIN;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
- if (unlikely(ret))
- return ret;
-
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
-
- flags = sr->msg_flags;
- if (issue_flags & IO_URING_F_NONBLOCK)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
-
- msg.msg_flags = flags;
- ret = sock_sendmsg(sock, &msg);
- if (ret < min_ret) {
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret > 0 && io_net_retry(sock, flags)) {
- sr->len -= ret;
- sr->buf += ret;
- sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
- return -EAGAIN;
- }
- req_set_fail(req);
- }
- if (ret >= 0)
- ret += sr->done_io;
- else if (sr->done_io)
- ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct iovec __user *uiov;
- size_t iov_len;
- int ret;
-
- ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
- &iomsg->uaddr, &uiov, &iov_len);
- if (ret)
- return ret;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- if (iov_len > 1)
- return -EINVAL;
- if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
- return -EFAULT;
- sr->len = iomsg->fast_iov[0].iov_len;
- iomsg->free_iov = NULL;
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &iomsg->free_iov, &iomsg->msg.msg_iter,
- false);
- if (ret > 0)
- ret = 0;
- }
-
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct compat_iovec __user *uiov;
- compat_uptr_t ptr;
- compat_size_t len;
- int ret;
-
- ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
- &ptr, &len);
- if (ret)
- return ret;
-
- uiov = compat_ptr(ptr);
- if (req->flags & REQ_F_BUFFER_SELECT) {
- compat_ssize_t clen;
-
- if (len > 1)
- return -EINVAL;
- if (!access_ok(uiov, sizeof(*uiov)))
- return -EFAULT;
- if (__get_user(clen, &uiov->iov_len))
- return -EFAULT;
- if (clen < 0)
- return -EINVAL;
- sr->len = clen;
- iomsg->free_iov = NULL;
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
- UIO_FASTIOV, &iomsg->free_iov,
- &iomsg->msg.msg_iter, true);
- if (ret < 0)
- return ret;
- }
-
- return 0;
-}
-#endif
-
-static int io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- iomsg->msg.msg_name = &iomsg->addr;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return __io_compat_recvmsg_copy_hdr(req, iomsg);
-#endif
-
- return __io_recvmsg_copy_hdr(req, iomsg);
-}
-
-static int io_recvmsg_prep_async(struct io_kiocb *req)
-{
- int ret;
-
- ret = io_recvmsg_copy_hdr(req, req->async_data);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
-}
-
-static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- if (unlikely(sqe->file_index))
- return -EINVAL;
-
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- sr->len = READ_ONCE(sqe->len);
- sr->flags = READ_ONCE(sqe->addr2);
- if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
- return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
- if (sr->msg_flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- sr->msg_flags |= MSG_CMSG_COMPAT;
-#endif
- sr->done_io = 0;
- return 0;
-}
-
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_async_msghdr iomsg, *kmsg;
- struct io_sr_msg *sr = &req->sr_msg;
- struct socket *sock;
- unsigned int cflags;
- unsigned flags;
- int ret, min_ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- } else {
- ret = io_recvmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg);
-
- if (io_do_buffer_select(req)) {
- void __user *buf;
-
- buf = io_buffer_select(req, &sr->len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- kmsg->fast_iov[0].iov_base = buf;
- kmsg->fast_iov[0].iov_len = sr->len;
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
- sr->len);
- }
-
- flags = sr->msg_flags;
- if (force_nonblock)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&kmsg->msg.msg_iter);
-
- kmsg->msg.msg_get_inq = 1;
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
- if (ret < min_ret) {
- if (ret == -EAGAIN && force_nonblock)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret > 0 && io_net_retry(sock, flags)) {
- sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
- return io_setup_async_msg(req, kmsg);
- }
- req_set_fail(req);
- } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
- req_set_fail(req);
- }
-
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret >= 0)
- ret += sr->done_io;
- else if (sr->done_io)
- ret = sr->done_io;
- cflags = io_put_kbuf(req, issue_flags);
- if (kmsg->msg.msg_inq)
- cflags |= IORING_CQE_F_SOCK_NONEMPTY;
- __io_req_complete(req, issue_flags, ret, cflags);
- return 0;
-}
-
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct msghdr msg;
- struct socket *sock;
- struct iovec iov;
- unsigned int cflags;
- unsigned flags;
- int ret, min_ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return -EAGAIN;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- if (io_do_buffer_select(req)) {
- void __user *buf;
-
- buf = io_buffer_select(req, &sr->len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- sr->buf = buf;
- }
-
- ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
- if (unlikely(ret))
- goto out_free;
-
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_get_inq = 1;
- msg.msg_flags = 0;
- msg.msg_controllen = 0;
- msg.msg_iocb = NULL;
-
- flags = sr->msg_flags;
- if (force_nonblock)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
-
- ret = sock_recvmsg(sock, &msg, flags);
- if (ret < min_ret) {
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret > 0 && io_net_retry(sock, flags)) {
- sr->len -= ret;
- sr->buf += ret;
- sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
- return -EAGAIN;
- }
- req_set_fail(req);
- } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
-out_free:
- req_set_fail(req);
- }
-
- if (ret >= 0)
- ret += sr->done_io;
- else if (sr->done_io)
- ret = sr->done_io;
- cflags = io_put_kbuf(req, issue_flags);
- if (msg.msg_inq)
- cflags |= IORING_CQE_F_SOCK_NONEMPTY;
- __io_req_complete(req, issue_flags, ret, cflags);
- return 0;
-}
-
-static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_accept *accept = &req->accept;
- unsigned flags;
-
- if (sqe->len || sqe->buf_index)
- return -EINVAL;
-
- accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
- accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- accept->flags = READ_ONCE(sqe->accept_flags);
- accept->nofile = rlimit(RLIMIT_NOFILE);
- flags = READ_ONCE(sqe->ioprio);
- if (flags & ~IORING_ACCEPT_MULTISHOT)
- return -EINVAL;
-
- accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot) {
- if (accept->flags & SOCK_CLOEXEC)
- return -EINVAL;
- if (flags & IORING_ACCEPT_MULTISHOT &&
- accept->file_slot != IORING_FILE_INDEX_ALLOC)
- return -EINVAL;
- }
- if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
- return -EINVAL;
- if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
- accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
- if (flags & IORING_ACCEPT_MULTISHOT)
- req->flags |= REQ_F_APOLL_MULTISHOT;
- return 0;
-}
-
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_accept *accept = &req->accept;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
- bool fixed = !!accept->file_slot;
- struct file *file;
- int ret, fd;
-
-retry:
- if (!fixed) {
- fd = __get_unused_fd_flags(accept->flags, accept->nofile);
- if (unlikely(fd < 0))
- return fd;
- }
- file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
- accept->flags);
- if (IS_ERR(file)) {
- if (!fixed)
- put_unused_fd(fd);
- ret = PTR_ERR(file);
- if (ret == -EAGAIN && force_nonblock) {
- /*
- * if it's multishot and polled, we don't need to
- * return EAGAIN to arm the poll infra since it
- * has already been done
- */
- if ((req->flags & IO_APOLL_MULTI_POLLED) ==
- IO_APOLL_MULTI_POLLED)
- ret = 0;
- return ret;
- }
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- req_set_fail(req);
- } else if (!fixed) {
- fd_install(fd, file);
- ret = fd;
- } else {
- ret = io_fixed_fd_install(req, issue_flags, file,
- accept->file_slot);
- }
-
- if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
- }
- if (ret >= 0) {
- bool filled;
-
- spin_lock(&ctx->completion_lock);
- filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret,
- IORING_CQE_F_MORE);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (filled) {
- io_cqring_ev_posted(ctx);
- goto retry;
- }
- ret = -ECANCELED;
- }
-
- return ret;
-}
-
-static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_socket *sock = &req->sock;
-
- if (sqe->addr || sqe->rw_flags || sqe->buf_index)
- return -EINVAL;
-
- sock->domain = READ_ONCE(sqe->fd);
- sock->type = READ_ONCE(sqe->off);
- sock->protocol = READ_ONCE(sqe->len);
- sock->file_slot = READ_ONCE(sqe->file_index);
- sock->nofile = rlimit(RLIMIT_NOFILE);
-
- sock->flags = sock->type & ~SOCK_TYPE_MASK;
- if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
- return -EINVAL;
- if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
- return -EINVAL;
- return 0;
-}
-
-static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_socket *sock = &req->sock;
- bool fixed = !!sock->file_slot;
- struct file *file;
- int ret, fd;
-
- if (!fixed) {
- fd = __get_unused_fd_flags(sock->flags, sock->nofile);
- if (unlikely(fd < 0))
- return fd;
- }
- file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
- if (IS_ERR(file)) {
- if (!fixed)
- put_unused_fd(fd);
- ret = PTR_ERR(file);
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- req_set_fail(req);
- } else if (!fixed) {
- fd_install(fd, file);
- ret = fd;
- } else {
- ret = io_fixed_fd_install(req, issue_flags, file,
- sock->file_slot);
- }
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_connect_prep_async(struct io_kiocb *req)
-{
- struct io_async_connect *io = req->async_data;
- struct io_connect *conn = &req->connect;
-
- return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
-}
-
-static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_connect *conn = &req->connect;
-
- if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
- return -EINVAL;
-
- conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
- conn->addr_len = READ_ONCE(sqe->addr2);
- return 0;
-}
-
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_async_connect __io, *io;
- unsigned file_flags;
- int ret;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- if (req_has_async_data(req)) {
- io = req->async_data;
- } else {
- ret = move_addr_to_kernel(req->connect.addr,
- req->connect.addr_len,
- &__io.address);
- if (ret)
- goto out;
- io = &__io;
- }
-
- file_flags = force_nonblock ? O_NONBLOCK : 0;
-
- ret = __sys_connect_file(req->file, &io->address,
- req->connect.addr_len, file_flags);
- if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req_has_async_data(req))
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- ret = -ENOMEM;
- goto out;
- }
- memcpy(req->async_data, &__io, sizeof(__io));
- return -EAGAIN;
- }
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
-out:
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-#else /* !CONFIG_NET */
-#define IO_NETOP_FN(op) \
-static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
-{ \
- return -EOPNOTSUPP; \
-}
-
-#define IO_NETOP_PREP(op) \
-IO_NETOP_FN(op) \
-static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
-{ \
- return -EOPNOTSUPP; \
-} \
-
-#define IO_NETOP_PREP_ASYNC(op) \
-IO_NETOP_PREP(op) \
-static int io_##op##_prep_async(struct io_kiocb *req) \
-{ \
- return -EOPNOTSUPP; \
-}
-
-IO_NETOP_PREP_ASYNC(sendmsg);
-IO_NETOP_PREP_ASYNC(recvmsg);
-IO_NETOP_PREP_ASYNC(connect);
-IO_NETOP_PREP(accept);
-IO_NETOP_PREP(socket);
-IO_NETOP_PREP(shutdown);
-IO_NETOP_FN(send);
-IO_NETOP_FN(recv);
-#endif /* CONFIG_NET */
-
-struct io_poll_table {
- struct poll_table_struct pt;
- struct io_kiocb *req;
- int nr_entries;
- int error;
-};
-
-#define IO_POLL_CANCEL_FLAG BIT(31)
-#define IO_POLL_REF_MASK GENMASK(30, 0)
-
-/*
- * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
- * bump it and acquire ownership. It's disallowed to modify requests while not
- * owning it, that prevents from races for enqueueing task_work's and b/w
- * arming poll and wakeups.
- */
-static inline bool io_poll_get_ownership(struct io_kiocb *req)
-{
- return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
-}
-
-static void io_poll_mark_cancelled(struct io_kiocb *req)
-{
- atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
-}
-
-static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
-{
- /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
- if (req->opcode == IORING_OP_POLL_ADD)
- return req->async_data;
- return req->apoll->double_poll;
-}
-
-static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
-{
- if (req->opcode == IORING_OP_POLL_ADD)
- return &req->poll;
- return &req->apoll->poll;
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
- hlist_add_head(&req->hash_node, list);
-}
-
-static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
- wait_queue_func_t wake_func)
-{
- poll->head = NULL;
-#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
- /* mask in events that we always want/need */
- poll->events = events | IO_POLL_UNMASK;
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, wake_func);
-}
-
-static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
-{
- struct wait_queue_head *head = smp_load_acquire(&poll->head);
-
- if (head) {
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
- }
-}
-
-static void io_poll_remove_entries(struct io_kiocb *req)
-{
- /*
- * Nothing to do if neither of those flags are set. Avoid dipping
- * into the poll/apoll/double cachelines if we can.
- */
- if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
- return;
-
- /*
- * While we hold the waitqueue lock and the waitqueue is nonempty,
- * wake_up_pollfree() will wait for us. However, taking the waitqueue
- * lock in the first place can race with the waitqueue being freed.
- *
- * We solve this as eventpoll does: by taking advantage of the fact that
- * all users of wake_up_pollfree() will RCU-delay the actual free. If
- * we enter rcu_read_lock() and see that the pointer to the queue is
- * non-NULL, we can then lock it without the memory being freed out from
- * under us.
- *
- * Keep holding rcu_read_lock() as long as we hold the queue lock, in
- * case the caller deletes the entry from the queue, leaving it empty.
- * In that case, only RCU prevents the queue memory from being freed.
- */
- rcu_read_lock();
- if (req->flags & REQ_F_SINGLE_POLL)
- io_poll_remove_entry(io_poll_get_single(req));
- if (req->flags & REQ_F_DOUBLE_POLL)
- io_poll_remove_entry(io_poll_get_double(req));
- rcu_read_unlock();
-}
-
-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);
-/*
- * All poll tw should go through this. Checks for poll events, manages
- * references, does rewait, etc.
- *
- * Returns a negative error on failure. >0 when no action require, which is
- * either spurious wakeup or multishot CQE is served. 0 when it's done with
- * the request, then the mask is stored in req->cqe.res.
- */
-static int io_poll_check_events(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int v, ret;
-
- /* req->task == current here, checking PF_EXITING is safe */
- if (unlikely(req->task->flags & PF_EXITING))
- return -ECANCELED;
-
- do {
- v = atomic_read(&req->poll_refs);
-
- /* tw handler should be the owner, and so have some references */
- if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
- return 0;
- if (v & IO_POLL_CANCEL_FLAG)
- return -ECANCELED;
-
- if (!req->cqe.res) {
- struct poll_table_struct pt = { ._key = req->apoll_events };
- req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
- }
-
- if ((unlikely(!req->cqe.res)))
- continue;
- if (req->apoll_events & EPOLLONESHOT)
- return 0;
-
- /* multishot, just fill a CQE and proceed */
- if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
- __poll_t mask = mangle_poll(req->cqe.res &
- req->apoll_events);
- bool filled;
-
- spin_lock(&ctx->completion_lock);
- filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
- mask, IORING_CQE_F_MORE);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (filled) {
- io_cqring_ev_posted(ctx);
- continue;
- }
- return -ECANCELED;
- }
-
- io_tw_lock(req->ctx, locked);
- if (unlikely(req->task->flags & PF_EXITING))
- return -EFAULT;
- ret = io_issue_sqe(req,
- IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
- if (ret)
- return ret;
-
- /*
- * Release all references, retry if someone tried to restart
- * task_work while we were executing it.
- */
- } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
-
- return 1;
-}
-
-static void io_poll_task_func(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- ret = io_poll_check_events(req, locked);
- if (ret > 0)
- return;
-
- if (!ret) {
- req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
- } else {
- req->cqe.res = ret;
- req_set_fail(req);
- }
-
- io_poll_remove_entries(req);
- spin_lock(&ctx->completion_lock);
- hash_del(&req->hash_node);
- __io_req_complete_post(req, req->cqe.res, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
-static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- ret = io_poll_check_events(req, locked);
- if (ret > 0)
- return;
-
- io_poll_remove_entries(req);
- spin_lock(&ctx->completion_lock);
- hash_del(&req->hash_node);
- spin_unlock(&ctx->completion_lock);
-
- if (!ret)
- io_req_task_submit(req, locked);
- else
- io_req_complete_failed(req, ret);
-}
-
-static void __io_poll_execute(struct io_kiocb *req, int mask,
- __poll_t __maybe_unused events)
-{
- req->cqe.res = mask;
- /*
- * This is useful for poll that is armed on behalf of another
- * request, and where the wakeup path could be on a different
- * CPU. We want to avoid pulling in req->apoll->events for that
- * case.
- */
- if (req->opcode == IORING_OP_POLL_ADD)
- req->io_task_work.func = io_poll_task_func;
- else
- req->io_task_work.func = io_apoll_task_func;
-
- trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
- io_req_task_work_add(req);
-}
-
-static inline void io_poll_execute(struct io_kiocb *req, int res,
- __poll_t events)
-{
- if (io_poll_get_ownership(req))
- __io_poll_execute(req, res, events);
-}
-
-static void io_poll_cancel_req(struct io_kiocb *req)
-{
- io_poll_mark_cancelled(req);
- /* kick tw, which should complete the request */
- io_poll_execute(req, 0, 0);
-}
-
-#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
-#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
-#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI)
-
-static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
- void *key)
-{
- struct io_kiocb *req = wqe_to_req(wait);
- struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
- wait);
- __poll_t mask = key_to_poll(key);
-
- if (unlikely(mask & POLLFREE)) {
- io_poll_mark_cancelled(req);
- /* we have to kick tw in case it's not already */
- io_poll_execute(req, 0, poll->events);
-
- /*
- * If the waitqueue is being freed early but someone is already
- * holds ownership over it, we have to tear down the request as
- * best we can. That means immediately removing the request from
- * its waitqueue and preventing all further accesses to the
- * waitqueue via the request.
- */
- list_del_init(&poll->wait.entry);
-
- /*
- * Careful: this *must* be the last step, since as soon
- * as req->head is NULL'ed out, the request can be
- * completed and freed, since aio_poll_complete_work()
- * will no longer need to take the waitqueue lock.
- */
- smp_store_release(&poll->head, NULL);
- return 1;
- }
-
- /* for instances that support it check for an event match first */
- if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
- return 0;
-
- if (io_poll_get_ownership(req)) {
- /* optional, saves extra locking for removal in tw handler */
- if (mask && poll->events & EPOLLONESHOT) {
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- if (wqe_is_double(wait))
- req->flags &= ~REQ_F_DOUBLE_POLL;
- else
- req->flags &= ~REQ_F_SINGLE_POLL;
- }
- __io_poll_execute(req, mask, poll->events);
- }
- return 1;
-}
-
-static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head,
- struct io_poll_iocb **poll_ptr)
-{
- struct io_kiocb *req = pt->req;
- unsigned long wqe_private = (unsigned long) req;
-
- /*
- * The file being polled uses multiple waitqueues for poll handling
- * (e.g. one for read, one for write). Setup a separate io_poll_iocb
- * if this happens.
- */
- if (unlikely(pt->nr_entries)) {
- struct io_poll_iocb *first = poll;
-
- /* double add on the same waitqueue head, ignore */
- if (first->head == head)
- return;
- /* already have a 2nd entry, fail a third attempt */
- if (*poll_ptr) {
- if ((*poll_ptr)->head == head)
- return;
- pt->error = -EINVAL;
- return;
- }
-
- poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
- if (!poll) {
- pt->error = -ENOMEM;
- return;
- }
- /* mark as double wq entry */
- wqe_private |= 1;
- req->flags |= REQ_F_DOUBLE_POLL;
- io_init_poll_iocb(poll, first->events, first->wait.func);
- *poll_ptr = poll;
- if (req->opcode == IORING_OP_POLL_ADD)
- req->flags |= REQ_F_ASYNC_DATA;
- }
-
- req->flags |= REQ_F_SINGLE_POLL;
- pt->nr_entries++;
- poll->head = head;
- poll->wait.private = (void *) wqe_private;
-
- if (poll->events & EPOLLEXCLUSIVE)
- add_wait_queue_exclusive(head, &poll->wait);
- else
- add_wait_queue(head, &poll->wait);
-}
-
-static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-
- __io_queue_proc(&pt->req->poll, pt, head,
- (struct io_poll_iocb **) &pt->req->async_data);
-}
-
-static int __io_arm_poll_handler(struct io_kiocb *req,
- struct io_poll_iocb *poll,
- struct io_poll_table *ipt, __poll_t mask)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int v;
-
- INIT_HLIST_NODE(&req->hash_node);
- req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
- io_init_poll_iocb(poll, mask, io_poll_wake);
- poll->file = req->file;
-
- req->apoll_events = poll->events;
-
- ipt->pt._key = mask;
- ipt->req = req;
- ipt->error = 0;
- ipt->nr_entries = 0;
-
- /*
- * Take the ownership to delay any tw execution up until we're done
- * with poll arming. see io_poll_get_ownership().
- */
- atomic_set(&req->poll_refs, 1);
- mask = vfs_poll(req->file, &ipt->pt) & poll->events;
-
- if (mask && (poll->events & EPOLLONESHOT)) {
- io_poll_remove_entries(req);
- /* no one else has access to the req, forget about the ref */
- return mask;
- }
- if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
- io_poll_remove_entries(req);
- if (!ipt->error)
- ipt->error = -EINVAL;
- return 0;
- }
-
- spin_lock(&ctx->completion_lock);
- io_poll_req_insert(req);
- spin_unlock(&ctx->completion_lock);
-
- if (mask) {
- /* can't multishot if failed, just queue the event we've got */
- if (unlikely(ipt->error || !ipt->nr_entries)) {
- poll->events |= EPOLLONESHOT;
- req->apoll_events |= EPOLLONESHOT;
- ipt->error = 0;
- }
- __io_poll_execute(req, mask, poll->events);
- return 0;
- }
-
- /*
- * Release ownership. If someone tried to queue a tw while it was
- * locked, kick it off for them.
- */
- v = atomic_dec_return(&req->poll_refs);
- if (unlikely(v & IO_POLL_REF_MASK))
- __io_poll_execute(req, 0, poll->events);
- return 0;
-}
-
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- struct async_poll *apoll = pt->req->apoll;
-
- __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
-}
-
-enum {
- IO_APOLL_OK,
- IO_APOLL_ABORTED,
- IO_APOLL_READY
-};
-
-static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_ring_ctx *ctx = req->ctx;
- struct async_poll *apoll;
- struct io_poll_table ipt;
- __poll_t mask = POLLPRI | POLLERR;
- int ret;
-
- if (!def->pollin && !def->pollout)
- return IO_APOLL_ABORTED;
- if (!file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
- return IO_APOLL_ABORTED;
- if (!(req->flags & REQ_F_APOLL_MULTISHOT))
- mask |= EPOLLONESHOT;
-
- if (def->pollin) {
- mask |= EPOLLIN | EPOLLRDNORM;
-
- /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
- if ((req->opcode == IORING_OP_RECVMSG) &&
- (req->sr_msg.msg_flags & MSG_ERRQUEUE))
- mask &= ~EPOLLIN;
- } else {
- mask |= EPOLLOUT | EPOLLWRNORM;
- }
- if (def->poll_exclusive)
- mask |= EPOLLEXCLUSIVE;
- if (req->flags & REQ_F_POLLED) {
- apoll = req->apoll;
- kfree(apoll->double_poll);
- } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
- !list_empty(&ctx->apoll_cache)) {
- apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
- poll.wait.entry);
- list_del_init(&apoll->poll.wait.entry);
- } else {
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
- }
- apoll->double_poll = NULL;
- req->apoll = apoll;
- req->flags |= REQ_F_POLLED;
- ipt.pt._qproc = io_async_queue_proc;
-
- io_kbuf_recycle(req, issue_flags);
-
- ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
- if (ret || ipt.error)
- return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
-
- trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
- mask, apoll->poll.events);
- return IO_APOLL_OK;
-}
-
-/*
- * Returns true if we found and killed one or more poll requests
- */
-static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
- struct task_struct *tsk, bool cancel_all)
-{
- struct hlist_node *tmp;
- struct io_kiocb *req;
- bool found = false;
- int i;
-
- spin_lock(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_match_task_safe(req, tsk, cancel_all)) {
- hlist_del_init(&req->hash_node);
- io_poll_cancel_req(req);
- found = true;
- }
- }
- }
- spin_unlock(&ctx->completion_lock);
- return found;
-}
-
-static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
- struct io_cancel_data *cd)
- __must_hold(&ctx->completion_lock)
-{
- struct hlist_head *list;
- struct io_kiocb *req;
-
- list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];
- hlist_for_each_entry(req, list, hash_node) {
- if (cd->data != req->cqe.user_data)
- continue;
- if (poll_only && req->opcode != IORING_OP_POLL_ADD)
- continue;
- if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
- if (cd->seq == req->work.cancel_seq)
- continue;
- req->work.cancel_seq = cd->seq;
- }
- return req;
- }
- return NULL;
-}
-
-static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
- struct io_cancel_data *cd)
- __must_hold(&ctx->completion_lock)
-{
- struct io_kiocb *req;
- int i;
-
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry(req, list, hash_node) {
- if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
- req->file != cd->file)
- continue;
- if (cd->seq == req->work.cancel_seq)
- continue;
- req->work.cancel_seq = cd->seq;
- return req;
- }
- }
- return NULL;
-}
-
-static bool io_poll_disarm(struct io_kiocb *req)
- __must_hold(&ctx->completion_lock)
-{
- if (!io_poll_get_ownership(req))
- return false;
- io_poll_remove_entries(req);
- hash_del(&req->hash_node);
- return true;
-}
-
-static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
- __must_hold(&ctx->completion_lock)
-{
- struct io_kiocb *req;
-
- if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
- req = io_poll_file_find(ctx, cd);
- else
- req = io_poll_find(ctx, false, cd);
- if (!req)
- return -ENOENT;
- io_poll_cancel_req(req);
- return 0;
-}
-
-static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
- unsigned int flags)
-{
- u32 events;
-
- events = READ_ONCE(sqe->poll32_events);
-#ifdef __BIG_ENDIAN
- events = swahw32(events);
-#endif
- if (!(flags & IORING_POLL_ADD_MULTI))
- events |= EPOLLONESHOT;
- return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
-}
-
-static int io_poll_remove_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_poll_update *upd = &req->poll_update;
- u32 flags;
-
- if (sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- flags = READ_ONCE(sqe->len);
- if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
- IORING_POLL_ADD_MULTI))
- return -EINVAL;
- /* meaningless without update */
- if (flags == IORING_POLL_ADD_MULTI)
- return -EINVAL;
-
- upd->old_user_data = READ_ONCE(sqe->addr);
- upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
- upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
-
- upd->new_user_data = READ_ONCE(sqe->off);
- if (!upd->update_user_data && upd->new_user_data)
- return -EINVAL;
- if (upd->update_events)
- upd->events = io_poll_parse_events(sqe, flags);
- else if (sqe->poll32_events)
- return -EINVAL;
-
- return 0;
-}
-
-static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_poll_iocb *poll = &req->poll;
- u32 flags;
-
- if (sqe->buf_index || sqe->off || sqe->addr)
- return -EINVAL;
- flags = READ_ONCE(sqe->len);
- if (flags & ~IORING_POLL_ADD_MULTI)
- return -EINVAL;
- if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
- return -EINVAL;
-
- io_req_set_refcount(req);
- poll->events = io_poll_parse_events(sqe, flags);
- return 0;
-}
-
-static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_poll_iocb *poll = &req->poll;
- struct io_poll_table ipt;
- int ret;
-
- ipt.pt._qproc = io_poll_queue_proc;
-
- ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
- if (!ret && ipt.error)
- req_set_fail(req);
- ret = ret ?: ipt.error;
- if (ret)
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *preq;
- int ret2, ret = 0;
- bool locked;
-
- spin_lock(&ctx->completion_lock);
- preq = io_poll_find(ctx, true, &cd);
- if (!preq || !io_poll_disarm(preq)) {
- spin_unlock(&ctx->completion_lock);
- ret = preq ? -EALREADY : -ENOENT;
- goto out;
- }
- spin_unlock(&ctx->completion_lock);
-
- if (req->poll_update.update_events || req->poll_update.update_user_data) {
- /* only mask one event flags, keep behavior flags */
- if (req->poll_update.update_events) {
- preq->poll.events &= ~0xffff;
- preq->poll.events |= req->poll_update.events & 0xffff;
- preq->poll.events |= IO_POLL_UNMASK;
- }
- if (req->poll_update.update_user_data)
- preq->cqe.user_data = req->poll_update.new_user_data;
-
- ret2 = io_poll_add(preq, issue_flags);
- /* successfully updated, don't complete poll request */
- if (!ret2)
- goto out;
- }
-
- req_set_fail(preq);
- preq->cqe.res = -ECANCELED;
- locked = !(issue_flags & IO_URING_F_UNLOCKED);
- io_req_task_complete(preq, &locked);
-out:
- if (ret < 0)
- req_set_fail(req);
- /* complete update request, we're done with it */
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
-{
- struct io_timeout_data *data = container_of(timer,
- struct io_timeout_data, timer);
- struct io_kiocb *req = data->req;
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->timeout_lock, flags);
- list_del_init(&req->timeout.list);
- atomic_set(&req->ctx->cq_timeouts,
- atomic_read(&req->ctx->cq_timeouts) + 1);
- spin_unlock_irqrestore(&ctx->timeout_lock, flags);
-
- if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
- req_set_fail(req);
-
- req->cqe.res = -ETIME;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
- return HRTIMER_NORESTART;
-}
-
-static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
- struct io_cancel_data *cd)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_timeout_data *io;
- struct io_kiocb *req;
- bool found = false;
-
- list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
- cd->data != req->cqe.user_data)
- continue;
- if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
- if (cd->seq == req->work.cancel_seq)
- continue;
- req->work.cancel_seq = cd->seq;
- }
- found = true;
- break;
- }
- if (!found)
- return ERR_PTR(-ENOENT);
-
- io = req->async_data;
- if (hrtimer_try_to_cancel(&io->timer) == -1)
- return ERR_PTR(-EALREADY);
- list_del_init(&req->timeout.list);
- return req;
-}
-
-static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
- __must_hold(&ctx->completion_lock)
-{
- struct io_kiocb *req;
-
- spin_lock_irq(&ctx->timeout_lock);
- req = io_timeout_extract(ctx, cd);
- spin_unlock_irq(&ctx->timeout_lock);
-
- if (IS_ERR(req))
- return PTR_ERR(req);
- io_req_task_queue_fail(req, -ECANCELED);
- return 0;
-}
-
-static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
-{
- switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
- case IORING_TIMEOUT_BOOTTIME:
- return CLOCK_BOOTTIME;
- case IORING_TIMEOUT_REALTIME:
- return CLOCK_REALTIME;
- default:
- /* can't happen, vetted at prep time */
- WARN_ON_ONCE(1);
- fallthrough;
- case 0:
- return CLOCK_MONOTONIC;
- }
-}
-
-static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
- struct timespec64 *ts, enum hrtimer_mode mode)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_timeout_data *io;
- struct io_kiocb *req;
- bool found = false;
-
- list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
- found = user_data == req->cqe.user_data;
- if (found)
- break;
- }
- if (!found)
- return -ENOENT;
-
- io = req->async_data;
- if (hrtimer_try_to_cancel(&io->timer) == -1)
- return -EALREADY;
- hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
- io->timer.function = io_link_timeout_fn;
- hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
- return 0;
-}
-
-static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
- struct timespec64 *ts, enum hrtimer_mode mode)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_cancel_data cd = { .data = user_data, };
- struct io_kiocb *req = io_timeout_extract(ctx, &cd);
- struct io_timeout_data *data;
-
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- req->timeout.off = 0; /* noseq */
- data = req->async_data;
- list_add_tail(&req->timeout.list, &ctx->timeout_list);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
- data->timer.function = io_timeout_fn;
- hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
- return 0;
-}
-
-static int io_timeout_remove_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_timeout_rem *tr = &req->timeout_rem;
-
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
- return -EINVAL;
- if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
- return -EINVAL;
-
- tr->ltimeout = false;
- tr->addr = READ_ONCE(sqe->addr);
- tr->flags = READ_ONCE(sqe->timeout_flags);
- if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
- if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
- return -EINVAL;
- if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
- tr->ltimeout = true;
- if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
- return -EINVAL;
- if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
- return -EFAULT;
- if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
- return -EINVAL;
- } else if (tr->flags) {
- /* timeout removal doesn't support flags */
- return -EINVAL;
- }
-
- return 0;
-}
-
-static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
-{
- return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
- : HRTIMER_MODE_REL;
-}
-
-/*
- * Remove or update an existing timeout command
- */
-static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_timeout_rem *tr = &req->timeout_rem;
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
- struct io_cancel_data cd = { .data = tr->addr, };
-
- spin_lock(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, &cd);
- spin_unlock(&ctx->completion_lock);
- } else {
- enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
-
- spin_lock_irq(&ctx->timeout_lock);
- if (tr->ltimeout)
- ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
- else
- ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
- spin_unlock_irq(&ctx->timeout_lock);
- }
-
- if (ret < 0)
- req_set_fail(req);
- io_req_complete_post(req, ret, 0);
- return 0;
-}
-
-static int __io_timeout_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe,
- bool is_timeout_link)
-{
- struct io_timeout_data *data;
- unsigned flags;
- u32 off = READ_ONCE(sqe->off);
-
- if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
- return -EINVAL;
- if (off && is_timeout_link)
- return -EINVAL;
- flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
- IORING_TIMEOUT_ETIME_SUCCESS))
- return -EINVAL;
- /* more than one clock specified is invalid, obviously */
- if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
- return -EINVAL;
-
- INIT_LIST_HEAD(&req->timeout.list);
- req->timeout.off = off;
- if (unlikely(off && !req->ctx->off_timeout_used))
- req->ctx->off_timeout_used = true;
-
- if (WARN_ON_ONCE(req_has_async_data(req)))
- return -EFAULT;
- if (io_alloc_async_data(req))
- return -ENOMEM;
-
- data = req->async_data;
- data->req = req;
- data->flags = flags;
-
- if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
- return -EFAULT;
-
- if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
- return -EINVAL;
-
- INIT_LIST_HEAD(&req->timeout.list);
- data->mode = io_translate_timeout_mode(flags);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
-
- if (is_timeout_link) {
- struct io_submit_link *link = &req->ctx->submit_state.link;
-
- if (!link->head)
- return -EINVAL;
- if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
- return -EINVAL;
- req->timeout.head = link->last;
- link->last->flags |= REQ_F_ARM_LTIMEOUT;
- }
- return 0;
-}
-
-static int io_timeout_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- return __io_timeout_prep(req, sqe, false);
-}
-
-static int io_link_timeout_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- return __io_timeout_prep(req, sqe, true);
-}
-
-static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data = req->async_data;
- struct list_head *entry;
- u32 tail, off = req->timeout.off;
-
- spin_lock_irq(&ctx->timeout_lock);
-
- /*
- * sqe->off holds how many events that need to occur for this
- * timeout event to be satisfied. If it isn't set, then this is
- * a pure timeout request, sequence isn't used.
- */
- if (io_is_timeout_noseq(req)) {
- entry = ctx->timeout_list.prev;
- goto add;
- }
-
- tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- req->timeout.target_seq = tail + off;
-
- /* Update the last seq here in case io_flush_timeouts() hasn't.
- * This is safe because ->completion_lock is held, and submissions
- * and completions are never mixed in the same ->completion_lock section.
- */
- ctx->cq_last_tm_flush = tail;
-
- /*
- * Insertion sort, ensuring the first entry in the list is always
- * the one we need first.
- */
- list_for_each_prev(entry, &ctx->timeout_list) {
- struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
- timeout.list);
-
- if (io_is_timeout_noseq(nxt))
- continue;
- /* nxt.seq is behind @tail, otherwise would've been completed */
- if (off >= nxt->timeout.target_seq - tail)
- break;
- }
-add:
- list_add(&req->timeout.list, entry);
- data->timer.function = io_timeout_fn;
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
- spin_unlock_irq(&ctx->timeout_lock);
- return 0;
-}
-
-static bool io_cancel_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_cancel_data *cd = data;
-
- if (req->ctx != cd->ctx)
- return false;
- if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
- ;
- } else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
- if (req->file != cd->file)
- return false;
- } else {
- if (req->cqe.user_data != cd->data)
- return false;
- }
- if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
- if (cd->seq == req->work.cancel_seq)
- return false;
- req->work.cancel_seq = cd->seq;
- }
- return true;
-}
-
-static int io_async_cancel_one(struct io_uring_task *tctx,
- struct io_cancel_data *cd)
-{
- enum io_wq_cancel cancel_ret;
- int ret = 0;
- bool all;
-
- if (!tctx || !tctx->io_wq)
- return -ENOENT;
-
- all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
- cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
- switch (cancel_ret) {
- case IO_WQ_CANCEL_OK:
- ret = 0;
- break;
- case IO_WQ_CANCEL_RUNNING:
- ret = -EALREADY;
- break;
- case IO_WQ_CANCEL_NOTFOUND:
- ret = -ENOENT;
- break;
- }
-
- return ret;
-}
-
-static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
-
- ret = io_async_cancel_one(req->task->io_uring, cd);
- /*
- * Fall-through even for -EALREADY, as we may have poll armed
- * that need unarming.
- */
- if (!ret)
- return 0;
-
- spin_lock(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, cd);
- if (ret != -ENOENT)
- goto out;
- if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
- ret = io_timeout_cancel(ctx, cd);
-out:
- spin_unlock(&ctx->completion_lock);
- return ret;
-}
-
-#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
- IORING_ASYNC_CANCEL_ANY)
-
-static int io_async_cancel_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
- if (sqe->off || sqe->len || sqe->splice_fd_in)
- return -EINVAL;
-
- req->cancel.addr = READ_ONCE(sqe->addr);
- req->cancel.flags = READ_ONCE(sqe->cancel_flags);
- if (req->cancel.flags & ~CANCEL_FLAGS)
- return -EINVAL;
- if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) {
- if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY)
- return -EINVAL;
- req->cancel.fd = READ_ONCE(sqe->fd);
- }
-
- return 0;
-}
-
-static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req,
- unsigned int issue_flags)
-{
- bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
- struct io_ring_ctx *ctx = cd->ctx;
- struct io_tctx_node *node;
- int ret, nr = 0;
-
- do {
- ret = io_try_cancel(req, cd);
- if (ret == -ENOENT)
- break;
- if (!all)
- return ret;
- nr++;
- } while (1);
-
- /* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, issue_flags);
- ret = -ENOENT;
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- ret = io_async_cancel_one(tctx, cd);
- if (ret != -ENOENT) {
- if (!all)
- break;
- nr++;
- }
- }
- io_ring_submit_unlock(ctx, issue_flags);
- return all ? nr : ret;
-}
-
-static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_cancel_data cd = {
- .ctx = req->ctx,
- .data = req->cancel.addr,
- .flags = req->cancel.flags,
- .seq = atomic_inc_return(&req->ctx->cancel_seq),
- };
- int ret;
-
- if (cd.flags & IORING_ASYNC_CANCEL_FD) {
- if (req->flags & REQ_F_FIXED_FILE)
- req->file = io_file_get_fixed(req, req->cancel.fd,
- issue_flags);
- else
- req->file = io_file_get_normal(req, req->cancel.fd);
- if (!req->file) {
- ret = -EBADF;
- goto done;
- }
- cd.file = req->file;
- }
-
- ret = __io_async_cancel(&cd, req, issue_flags);
-done:
- if (ret < 0)
- req_set_fail(req);
- io_req_complete_post(req, ret, 0);
- return 0;
-}
-
-static int io_files_update_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
- return -EINVAL;
- if (sqe->rw_flags || sqe->splice_fd_in)
- return -EINVAL;
-
- req->rsrc_update.offset = READ_ONCE(sqe->off);
- req->rsrc_update.nr_args = READ_ONCE(sqe->len);
- if (!req->rsrc_update.nr_args)
- return -EINVAL;
- req->rsrc_update.arg = READ_ONCE(sqe->addr);
- return 0;
-}
-
-static int io_files_update_with_index_alloc(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg);
- unsigned int done;
- struct file *file;
- int ret, fd;
-
- for (done = 0; done < req->rsrc_update.nr_args; done++) {
- if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
- ret = -EFAULT;
- break;
- }
-
- file = fget(fd);
- if (!file) {
- ret = -EBADF;
- break;
- }
- ret = io_fixed_fd_install(req, issue_flags, file,
- IORING_FILE_INDEX_ALLOC);
- if (ret < 0)
- break;
- if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
- __io_close_fixed(req, issue_flags, ret);
- ret = -EFAULT;
- break;
- }
- }
-
- if (done)
- return done;
- return ret;
-}
-
-static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_rsrc_update2 up;
- int ret;
-
- up.offset = req->rsrc_update.offset;
- up.data = req->rsrc_update.arg;
- up.nr = 0;
- up.tags = 0;
- up.resv = 0;
- up.resv2 = 0;
-
- if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) {
- ret = io_files_update_with_index_alloc(req, issue_flags);
- } else {
- io_ring_submit_lock(ctx, issue_flags);
- ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
- &up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, issue_flags);
- }
-
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- switch (req->opcode) {
- case IORING_OP_NOP:
- return io_nop_prep(req, sqe);
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- return io_prep_rw(req, sqe);
- case IORING_OP_POLL_ADD:
- return io_poll_add_prep(req, sqe);
- case IORING_OP_POLL_REMOVE:
- return io_poll_remove_prep(req, sqe);
- case IORING_OP_FSYNC:
- return io_fsync_prep(req, sqe);
- case IORING_OP_SYNC_FILE_RANGE:
- return io_sfr_prep(req, sqe);
- case IORING_OP_SENDMSG:
- case IORING_OP_SEND:
- return io_sendmsg_prep(req, sqe);
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- return io_recvmsg_prep(req, sqe);
- case IORING_OP_CONNECT:
- return io_connect_prep(req, sqe);
- case IORING_OP_TIMEOUT:
- return io_timeout_prep(req, sqe);
- case IORING_OP_TIMEOUT_REMOVE:
- return io_timeout_remove_prep(req, sqe);
- case IORING_OP_ASYNC_CANCEL:
- return io_async_cancel_prep(req, sqe);
- case IORING_OP_LINK_TIMEOUT:
- return io_link_timeout_prep(req, sqe);
- case IORING_OP_ACCEPT:
- return io_accept_prep(req, sqe);
- case IORING_OP_FALLOCATE:
- return io_fallocate_prep(req, sqe);
- case IORING_OP_OPENAT:
- return io_openat_prep(req, sqe);
- case IORING_OP_CLOSE:
- return io_close_prep(req, sqe);
- case IORING_OP_FILES_UPDATE:
- return io_files_update_prep(req, sqe);
- case IORING_OP_STATX:
- return io_statx_prep(req, sqe);
- case IORING_OP_FADVISE:
- return io_fadvise_prep(req, sqe);
- case IORING_OP_MADVISE:
- return io_madvise_prep(req, sqe);
- case IORING_OP_OPENAT2:
- return io_openat2_prep(req, sqe);
- case IORING_OP_EPOLL_CTL:
- return io_epoll_ctl_prep(req, sqe);
- case IORING_OP_SPLICE:
- return io_splice_prep(req, sqe);
- case IORING_OP_PROVIDE_BUFFERS:
- return io_provide_buffers_prep(req, sqe);
- case IORING_OP_REMOVE_BUFFERS:
- return io_remove_buffers_prep(req, sqe);
- case IORING_OP_TEE:
- return io_tee_prep(req, sqe);
- case IORING_OP_SHUTDOWN:
- return io_shutdown_prep(req, sqe);
- case IORING_OP_RENAMEAT:
- return io_renameat_prep(req, sqe);
- case IORING_OP_UNLINKAT:
- return io_unlinkat_prep(req, sqe);
- case IORING_OP_MKDIRAT:
- return io_mkdirat_prep(req, sqe);
- case IORING_OP_SYMLINKAT:
- return io_symlinkat_prep(req, sqe);
- case IORING_OP_LINKAT:
- return io_linkat_prep(req, sqe);
- case IORING_OP_MSG_RING:
- return io_msg_ring_prep(req, sqe);
- case IORING_OP_FSETXATTR:
- return io_fsetxattr_prep(req, sqe);
- case IORING_OP_SETXATTR:
- return io_setxattr_prep(req, sqe);
- case IORING_OP_FGETXATTR:
- return io_fgetxattr_prep(req, sqe);
- case IORING_OP_GETXATTR:
- return io_getxattr_prep(req, sqe);
- case IORING_OP_SOCKET:
- return io_socket_prep(req, sqe);
- case IORING_OP_URING_CMD:
- return io_uring_cmd_prep(req, sqe);
- }
-
- printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
- req->opcode);
- return -EINVAL;
-}
-
-static int io_req_prep_async(struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
-
- /* assign early for deferred execution for non-fixed file */
- if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
- req->file = io_file_get_normal(req, req->cqe.fd);
- if (!def->needs_async_setup)
- return 0;
- if (WARN_ON_ONCE(req_has_async_data(req)))
- return -EFAULT;
- if (io_alloc_async_data(req))
- return -EAGAIN;
-
- switch (req->opcode) {
- case IORING_OP_READV:
- return io_readv_prep_async(req);
- case IORING_OP_WRITEV:
- return io_writev_prep_async(req);
- case IORING_OP_SENDMSG:
- return io_sendmsg_prep_async(req);
- case IORING_OP_RECVMSG:
- return io_recvmsg_prep_async(req);
- case IORING_OP_CONNECT:
- return io_connect_prep_async(req);
- case IORING_OP_URING_CMD:
- return io_uring_cmd_prep_async(req);
- }
- printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
- req->opcode);
- return -EFAULT;
-}
-
-static u32 io_get_sequence(struct io_kiocb *req)
-{
- u32 seq = req->ctx->cached_sq_head;
- struct io_kiocb *cur;
-
- /* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
- return seq;
-}
-
-static __cold void io_drain_req(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_defer_entry *de;
- int ret;
- u32 seq = io_get_sequence(req);
-
- /* Still need defer if there is pending req in defer list. */
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
-queue:
- ctx->drain_active = false;
- io_req_task_queue(req);
- return;
- }
- spin_unlock(&ctx->completion_lock);
-
- ret = io_req_prep_async(req);
- if (ret) {
-fail:
- io_req_complete_failed(req, ret);
- return;
- }
- io_prep_async_link(req);
- de = kmalloc(sizeof(*de), GFP_KERNEL);
- if (!de) {
- ret = -ENOMEM;
- goto fail;
- }
-
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
- kfree(de);
- goto queue;
- }
-
- trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
- de->req = req;
- de->seq = seq;
- list_add_tail(&de->list, &ctx->defer_list);
- spin_unlock(&ctx->completion_lock);
-}
-
-static void io_clean_op(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- spin_lock(&req->ctx->completion_lock);
- io_put_kbuf_comp(req);
- spin_unlock(&req->ctx->completion_lock);
- }
-
- if (req->flags & REQ_F_NEED_CLEANUP) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE: {
- struct io_async_rw *io = req->async_data;
-
- kfree(io->free_iovec);
- break;
- }
- case IORING_OP_RECVMSG:
- case IORING_OP_SENDMSG: {
- struct io_async_msghdr *io = req->async_data;
-
- kfree(io->free_iov);
- break;
- }
- case IORING_OP_OPENAT:
- case IORING_OP_OPENAT2:
- if (req->open.filename)
- putname(req->open.filename);
- break;
- case IORING_OP_RENAMEAT:
- putname(req->rename.oldpath);
- putname(req->rename.newpath);
- break;
- case IORING_OP_UNLINKAT:
- putname(req->unlink.filename);
- break;
- case IORING_OP_MKDIRAT:
- putname(req->mkdir.filename);
- break;
- case IORING_OP_SYMLINKAT:
- putname(req->symlink.oldpath);
- putname(req->symlink.newpath);
- break;
- case IORING_OP_LINKAT:
- putname(req->hardlink.oldpath);
- putname(req->hardlink.newpath);
- break;
- case IORING_OP_STATX:
- if (req->statx.filename)
- putname(req->statx.filename);
- break;
- case IORING_OP_SETXATTR:
- case IORING_OP_FSETXATTR:
- case IORING_OP_GETXATTR:
- case IORING_OP_FGETXATTR:
- __io_xattr_finish(req);
- break;
- }
- }
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- kfree(req->apoll->double_poll);
- kfree(req->apoll);
- req->apoll = NULL;
- }
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
- if (req->flags & REQ_F_CREDS)
- put_cred(req->creds);
- if (req->flags & REQ_F_ASYNC_DATA) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
- req->flags &= ~IO_REQ_CLEAN_FLAGS;
-}
-
-static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
-{
- if (req->file || !io_op_defs[req->opcode].needs_file)
- return true;
-
- if (req->flags & REQ_F_FIXED_FILE)
- req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
- else
- req->file = io_file_get_normal(req, req->cqe.fd);
-
- return !!req->file;
-}
-
-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- const struct cred *creds = NULL;
- int ret;
-
- if (unlikely(!io_assign_file(req, issue_flags)))
- return -EBADF;
-
- if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
- creds = override_creds(req->creds);
-
- if (!def->audit_skip)
- audit_uring_entry(req->opcode);
-
- switch (req->opcode) {
- case IORING_OP_NOP:
- ret = io_nop(req, issue_flags);
- break;
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- ret = io_read(req, issue_flags);
- break;
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- ret = io_write(req, issue_flags);
- break;
- case IORING_OP_FSYNC:
- ret = io_fsync(req, issue_flags);
- break;
- case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, issue_flags);
- break;
- case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, issue_flags);
- break;
- case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, issue_flags);
- break;
- case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, issue_flags);
- break;
- case IORING_OP_SEND:
- ret = io_send(req, issue_flags);
- break;
- case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, issue_flags);
- break;
- case IORING_OP_RECV:
- ret = io_recv(req, issue_flags);
- break;
- case IORING_OP_TIMEOUT:
- ret = io_timeout(req, issue_flags);
- break;
- case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove(req, issue_flags);
- break;
- case IORING_OP_ACCEPT:
- ret = io_accept(req, issue_flags);
- break;
- case IORING_OP_CONNECT:
- ret = io_connect(req, issue_flags);
- break;
- case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel(req, issue_flags);
- break;
- case IORING_OP_FALLOCATE:
- ret = io_fallocate(req, issue_flags);
- break;
- case IORING_OP_OPENAT:
- ret = io_openat(req, issue_flags);
- break;
- case IORING_OP_CLOSE:
- ret = io_close(req, issue_flags);
- break;
- case IORING_OP_FILES_UPDATE:
- ret = io_files_update(req, issue_flags);
- break;
- case IORING_OP_STATX:
- ret = io_statx(req, issue_flags);
- break;
- case IORING_OP_FADVISE:
- ret = io_fadvise(req, issue_flags);
- break;
- case IORING_OP_MADVISE:
- ret = io_madvise(req, issue_flags);
- break;
- case IORING_OP_OPENAT2:
- ret = io_openat2(req, issue_flags);
- break;
- case IORING_OP_EPOLL_CTL:
- ret = io_epoll_ctl(req, issue_flags);
- break;
- case IORING_OP_SPLICE:
- ret = io_splice(req, issue_flags);
- break;
- case IORING_OP_PROVIDE_BUFFERS:
- ret = io_provide_buffers(req, issue_flags);
- break;
- case IORING_OP_REMOVE_BUFFERS:
- ret = io_remove_buffers(req, issue_flags);
- break;
- case IORING_OP_TEE:
- ret = io_tee(req, issue_flags);
- break;
- case IORING_OP_SHUTDOWN:
- ret = io_shutdown(req, issue_flags);
- break;
- case IORING_OP_RENAMEAT:
- ret = io_renameat(req, issue_flags);
- break;
- case IORING_OP_UNLINKAT:
- ret = io_unlinkat(req, issue_flags);
- break;
- case IORING_OP_MKDIRAT:
- ret = io_mkdirat(req, issue_flags);
- break;
- case IORING_OP_SYMLINKAT:
- ret = io_symlinkat(req, issue_flags);
- break;
- case IORING_OP_LINKAT:
- ret = io_linkat(req, issue_flags);
- break;
- case IORING_OP_MSG_RING:
- ret = io_msg_ring(req, issue_flags);
- break;
- case IORING_OP_FSETXATTR:
- ret = io_fsetxattr(req, issue_flags);
- break;
- case IORING_OP_SETXATTR:
- ret = io_setxattr(req, issue_flags);
- break;
- case IORING_OP_FGETXATTR:
- ret = io_fgetxattr(req, issue_flags);
- break;
- case IORING_OP_GETXATTR:
- ret = io_getxattr(req, issue_flags);
- break;
- case IORING_OP_SOCKET:
- ret = io_socket(req, issue_flags);
- break;
- case IORING_OP_URING_CMD:
- ret = io_uring_cmd(req, issue_flags);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- if (!def->audit_skip)
- audit_uring_exit(!ret, ret);
-
- if (creds)
- revert_creds(creds);
- if (ret)
- return ret;
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req, issue_flags);
-
- return 0;
-}
-
-static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- req = io_put_req_find_next(req);
- return req ? &req->work : NULL;
-}
-
-static void io_wq_submit_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- const struct io_op_def *def = &io_op_defs[req->opcode];
- unsigned int issue_flags = IO_URING_F_UNLOCKED;
- bool needs_poll = false;
- int ret = 0, err = -ECANCELED;
-
- /* one will be dropped by ->io_free_work() after returning to io-wq */
- if (!(req->flags & REQ_F_REFCOUNT))
- __io_req_set_refcount(req, 2);
- else
- req_ref_get(req);
-
- io_arm_ltimeout(req);
-
- /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL) {
-fail:
- io_req_task_queue_fail(req, err);
- return;
- }
- if (!io_assign_file(req, issue_flags)) {
- err = -EBADF;
- work->flags |= IO_WQ_WORK_CANCEL;
- goto fail;
- }
-
- if (req->flags & REQ_F_FORCE_ASYNC) {
- bool opcode_poll = def->pollin || def->pollout;
-
- if (opcode_poll && file_can_poll(req->file)) {
- needs_poll = true;
- issue_flags |= IO_URING_F_NONBLOCK;
- }
- }
-
- do {
- ret = io_issue_sqe(req, issue_flags);
- if (ret != -EAGAIN)
- break;
- /*
- * We can get EAGAIN for iopolled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (!needs_poll) {
- if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
- break;
- cond_resched();
- continue;
- }
-
- if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
- return;
- /* aborted or ready, in either case retry blocking */
- needs_poll = false;
- issue_flags &= ~IO_URING_F_NONBLOCK;
- } while (1);
-
- /* avoid locking problems by failing it from a clean context */
- if (ret)
- io_req_task_queue_fail(req, ret);
-}
-
-static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
- unsigned i)
-{
- return &table->files[i];
-}
-
-static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
- int index)
-{
- struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
-
- return (struct file *) (slot->file_ptr & FFS_MASK);
-}
-
-static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
-{
- unsigned long file_ptr = (unsigned long) file;
-
- file_ptr |= io_file_get_flags(file);
- file_slot->file_ptr = file_ptr;
-}
-
-static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
- unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct file *file = NULL;
- unsigned long file_ptr;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- goto out;
- fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
- file = (struct file *) (file_ptr & FFS_MASK);
- file_ptr &= ~FFS_MASK;
- /* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
- io_req_set_rsrc_node(req, ctx, 0);
- WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
-out:
- io_ring_submit_unlock(ctx, issue_flags);
- return file;
-}
-
-static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
-{
- struct file *file = fget(fd);
-
- trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
-
- /* we don't allow fixed io_uring files */
- if (file && file->f_op == &io_uring_fops)
- io_req_track_inflight(req);
- return file;
-}
-
-static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
-{
- struct io_kiocb *prev = req->timeout.prev;
- int ret = -ENOENT;
-
- if (prev) {
- if (!(req->task->flags & PF_EXITING)) {
- struct io_cancel_data cd = {
- .ctx = req->ctx,
- .data = prev->cqe.user_data,
- };
-
- ret = io_try_cancel(req, &cd);
- }
- io_req_complete_post(req, ret ?: -ETIME, 0);
- io_put_req(prev);
- } else {
- io_req_complete_post(req, -ETIME, 0);
- }
-}
-
-static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
-{
- struct io_timeout_data *data = container_of(timer,
- struct io_timeout_data, timer);
- struct io_kiocb *prev, *req = data->req;
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->timeout_lock, flags);
- prev = req->timeout.head;
- req->timeout.head = NULL;
-
- /*
- * We don't expect the list to be empty, that will only happen if we
- * race with the completion of the linked work.
- */
- if (prev) {
- io_remove_next_linked(prev);
- if (!req_ref_inc_not_zero(prev))
- prev = NULL;
- }
- list_del(&req->timeout.list);
- req->timeout.prev = prev;
- spin_unlock_irqrestore(&ctx->timeout_lock, flags);
-
- req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req);
- return HRTIMER_NORESTART;
-}
-
-static void io_queue_linked_timeout(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->timeout_lock);
- /*
- * If the back reference is NULL, then our linked request finished
- * before we got a chance to setup the timer
- */
- if (req->timeout.head) {
- struct io_timeout_data *data = req->async_data;
-
- data->timer.function = io_link_timeout_fn;
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
- data->mode);
- list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
- }
- spin_unlock_irq(&ctx->timeout_lock);
- /* drop submission reference */
- io_put_req(req);
-}
-
-static void io_queue_async(struct io_kiocb *req, int ret)
- __must_hold(&req->ctx->uring_lock)
-{
- struct io_kiocb *linked_timeout;
-
- if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
- io_req_complete_failed(req, ret);
- return;
- }
-
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req, 0)) {
- case IO_APOLL_READY:
- io_req_task_queue(req);
- break;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_kbuf_recycle(req, 0);
- io_queue_iowq(req, NULL);
- break;
- case IO_APOLL_OK:
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
-}
-
-static inline void io_queue_sqe(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- int ret;
-
- ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
-
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- io_req_add_compl_list(req);
- return;
- }
- /*
- * We async punt it if the file wasn't marked NOWAIT, or if the file
- * doesn't support non-blocking read/write attempts
- */
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
- io_queue_async(req, ret);
-}
-
-static void io_queue_sqe_fallback(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- if (unlikely(req->flags & REQ_F_FAIL)) {
- /*
- * We don't submit, fail them all, for that replace hardlinks
- * with normal links. Extra REQ_F_LINK is tolerated.
- */
- req->flags &= ~REQ_F_HARDLINK;
- req->flags |= REQ_F_LINK;
- io_req_complete_failed(req, req->cqe.res);
- } else if (unlikely(req->ctx->drain_active)) {
- io_drain_req(req);
- } else {
- int ret = io_req_prep_async(req);
-
- if (unlikely(ret))
- io_req_complete_failed(req, ret);
- else
- io_queue_iowq(req, NULL);
- }
-}
-
-/*
- * Check SQE restrictions (opcode and flags).
- *
- * Returns 'true' if SQE is allowed, 'false' otherwise.
- */
-static inline bool io_check_restriction(struct io_ring_ctx *ctx,
- struct io_kiocb *req,
- unsigned int sqe_flags)
-{
- if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
- return false;
-
- if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
- ctx->restrictions.sqe_flags_required)
- return false;
-
- if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
- ctx->restrictions.sqe_flags_required))
- return false;
-
- return true;
-}
-
-static void io_init_req_drain(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *head = ctx->submit_state.link.head;
-
- ctx->drain_active = true;
- if (head) {
- /*
- * If we need to drain a request in the middle of a link, drain
- * the head request and the next request/link after the current
- * link. Considering sequential execution of links,
- * REQ_F_IO_DRAIN will be maintained for every request of our
- * link.
- */
- head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
- ctx->drain_next = true;
- }
-}
-
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
- __must_hold(&ctx->uring_lock)
-{
- const struct io_op_def *def;
- unsigned int sqe_flags;
- int personality;
- u8 opcode;
-
- /* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = opcode = READ_ONCE(sqe->opcode);
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags = sqe_flags = READ_ONCE(sqe->flags);
- req->cqe.user_data = READ_ONCE(sqe->user_data);
- req->file = NULL;
- req->rsrc_node = NULL;
- req->task = current;
-
- if (unlikely(opcode >= IORING_OP_LAST)) {
- req->opcode = 0;
- return -EINVAL;
- }
- def = &io_op_defs[opcode];
- if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
- /* enforce forwards compatibility on users */
- if (sqe_flags & ~SQE_VALID_FLAGS)
- return -EINVAL;
- if (sqe_flags & IOSQE_BUFFER_SELECT) {
- if (!def->buffer_select)
- return -EOPNOTSUPP;
- req->buf_index = READ_ONCE(sqe->buf_group);
- }
- if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
- ctx->drain_disabled = true;
- if (sqe_flags & IOSQE_IO_DRAIN) {
- if (ctx->drain_disabled)
- return -EOPNOTSUPP;
- io_init_req_drain(req);
- }
- }
- if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
- if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
- /* knock it to the slow queue path, will be drained there */
- if (ctx->drain_active)
- req->flags |= REQ_F_FORCE_ASYNC;
- /* if there is no link, we're at "next" request and need to drain */
- if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
- ctx->drain_next = false;
- ctx->drain_active = true;
- req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
- }
- }
-
- if (!def->ioprio && sqe->ioprio)
- return -EINVAL;
- if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- if (def->needs_file) {
- struct io_submit_state *state = &ctx->submit_state;
-
- req->cqe.fd = READ_ONCE(sqe->fd);
-
- /*
- * Plug now if we have more than 2 IO left after this, and the
- * target is potentially a read/write to block based storage.
- */
- if (state->need_plug && def->plug) {
- state->plug_started = true;
- state->need_plug = false;
- blk_start_plug_nr_ios(&state->plug, state->submit_nr);
- }
- }
-
- personality = READ_ONCE(sqe->personality);
- if (personality) {
- int ret;
-
- req->creds = xa_load(&ctx->personalities, personality);
- if (!req->creds)
- return -EINVAL;
- get_cred(req->creds);
- ret = security_uring_override_creds(req->creds);
- if (ret) {
- put_cred(req->creds);
- return ret;
- }
- req->flags |= REQ_F_CREDS;
- }
-
- return io_req_prep(req, sqe);
-}
-
-static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
- struct io_kiocb *req, int ret)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_link *link = &ctx->submit_state.link;
- struct io_kiocb *head = link->head;
-
- trace_io_uring_req_failed(sqe, ctx, req, ret);
-
- /*
- * Avoid breaking links in the middle as it renders links with SQPOLL
- * unusable. Instead of failing eagerly, continue assembling the link if
- * applicable and mark the head with REQ_F_FAIL. The link flushing code
- * should find the flag and handle the rest.
- */
- req_fail_link_node(req, ret);
- if (head && !(head->flags & REQ_F_FAIL))
- req_fail_link_node(head, -ECANCELED);
-
- if (!(req->flags & IO_REQ_LINK_FLAGS)) {
- if (head) {
- link->last->link = req;
- link->head = NULL;
- req = head;
- }
- io_queue_sqe_fallback(req);
- return ret;
- }
-
- if (head)
- link->last->link = req;
- else
- link->head = req;
- link->last = req;
- return 0;
-}
-
-static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
- __must_hold(&ctx->uring_lock)
-{
- struct io_submit_link *link = &ctx->submit_state.link;
- int ret;
-
- ret = io_init_req(ctx, req, sqe);
- if (unlikely(ret))
- return io_submit_fail_init(sqe, req, ret);
-
- /* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
- req->flags, true,
- ctx->flags & IORING_SETUP_SQPOLL);
-
- /*
- * If we already have a head request, queue this one for async
- * submittal once the head completes. If we don't have a head but
- * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
- * submitted sync once the chain is complete. If none of those
- * conditions are true (normal request), then just queue it.
- */
- if (unlikely(link->head)) {
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- return io_submit_fail_init(sqe, req, ret);
-
- trace_io_uring_link(ctx, req, link->head);
- link->last->link = req;
- link->last = req;
-
- if (req->flags & IO_REQ_LINK_FLAGS)
- return 0;
- /* last request of the link, flush it */
- req = link->head;
- link->head = NULL;
- if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
- goto fallback;
-
- } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
- REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
- if (req->flags & IO_REQ_LINK_FLAGS) {
- link->head = req;
- link->last = req;
- } else {
-fallback:
- io_queue_sqe_fallback(req);
- }
- return 0;
- }
-
- io_queue_sqe(req);
- return 0;
-}
-
-/*
- * Batched submission is done, ensure local IO is flushed out.
- */
-static void io_submit_state_end(struct io_ring_ctx *ctx)
-{
- struct io_submit_state *state = &ctx->submit_state;
-
- if (unlikely(state->link.head))
- io_queue_sqe_fallback(state->link.head);
- /* flush only after queuing links as they can generate completions */
- io_submit_flush_completions(ctx);
- if (state->plug_started)
- blk_finish_plug(&state->plug);
-}
-
-/*
- * Start submission side cache.
- */
-static void io_submit_state_start(struct io_submit_state *state,
- unsigned int max_ios)
-{
- state->plug_started = false;
- state->need_plug = max_ios > 2;
- state->submit_nr = max_ios;
- /* set only head, no need to init link_last in advance */
- state->link.head = NULL;
-}
-
-static void io_commit_sqring(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
-
- /*
- * Ensure any loads from the SQEs are done at this point,
- * since once we write the new head, the application could
- * write new data to them.
- */
- smp_store_release(&rings->sq.head, ctx->cached_sq_head);
-}
-
-/*
- * Fetch an sqe, if one is available. Note this returns a pointer to memory
- * that is mapped by userspace. This means that care needs to be taken to
- * ensure that reads are stable, as we cannot rely on userspace always
- * being a good citizen. If members of the sqe are validated and then later
- * used, it's important that those reads are done through READ_ONCE() to
- * prevent a re-load down the line.
- */
-static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
-{
- unsigned head, mask = ctx->sq_entries - 1;
- unsigned sq_idx = ctx->cached_sq_head++ & mask;
-
- /*
- * The cached sq head (or cq tail) serves two purposes:
- *
- * 1) allows us to batch the cost of updating the user visible
- * head updates.
- * 2) allows the kernel side to track the head on its own, even
- * though the application is the one updating it.
- */
- head = READ_ONCE(ctx->sq_array[sq_idx]);
- if (likely(head < ctx->sq_entries)) {
- /* double index for 128-byte SQEs, twice as long */
- if (ctx->flags & IORING_SETUP_SQE128)
- head <<= 1;
- return &ctx->sq_sqes[head];
- }
-
- /* drop invalid entries */
- ctx->cq_extra--;
- WRITE_ONCE(ctx->rings->sq_dropped,
- READ_ONCE(ctx->rings->sq_dropped) + 1);
- return NULL;
-}
-
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
- __must_hold(&ctx->uring_lock)
-{
- unsigned int entries = io_sqring_entries(ctx);
- unsigned int left;
- int ret;
-
- if (unlikely(!entries))
- return 0;
- /* make sure SQ entry isn't read before tail */
- ret = left = min3(nr, ctx->sq_entries, entries);
- io_get_task_refs(left);
- io_submit_state_start(&ctx->submit_state, left);
-
- do {
- const struct io_uring_sqe *sqe;
- struct io_kiocb *req;
-
- if (unlikely(!io_alloc_req_refill(ctx)))
- break;
- req = io_alloc_req(ctx);
- sqe = io_get_sqe(ctx);
- if (unlikely(!sqe)) {
- io_req_add_to_cache(req, ctx);
- break;
- }
-
- /*
- * Continue submitting even for sqe failure if the
- * ring was setup with IORING_SETUP_SUBMIT_ALL
- */
- if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
- !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
- left--;
- break;
- }
- } while (--left);
-
- if (unlikely(left)) {
- ret -= left;
- /* try again if it submitted nothing and can't allocate a req */
- if (!ret && io_req_cache_empty(ctx))
- ret = -EAGAIN;
- current->io_uring->cached_refs += left;
- }
-
- io_submit_state_end(ctx);
- /* Commit SQ ring head once we've consumed and submitted all SQEs */
- io_commit_sqring(ctx);
- return ret;
-}
-
-static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
-{
- return READ_ONCE(sqd->state);
-}
-
-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
-{
- unsigned int to_submit;
- int ret = 0;
-
- to_submit = io_sqring_entries(ctx);
- /* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
- to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
-
- if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
- const struct cred *creds = NULL;
-
- if (ctx->sq_creds != current_cred())
- creds = override_creds(ctx->sq_creds);
-
- mutex_lock(&ctx->uring_lock);
- if (!wq_list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, true);
-
- /*
- * Don't submit if refs are dying, good for io_uring_register(),
- * but also it is relied upon by io_ring_exit_work()
- */
- if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
- !(ctx->flags & IORING_SETUP_R_DISABLED))
- ret = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-
- if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
- if (creds)
- revert_creds(creds);
- }
-
- return ret;
-}
-
-static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
-{
- struct io_ring_ctx *ctx;
- unsigned sq_thread_idle = 0;
-
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
- sqd->sq_thread_idle = sq_thread_idle;
-}
-
-static bool io_sqd_handle_event(struct io_sq_data *sqd)
-{
- bool did_sig = false;
- struct ksignal ksig;
-
- if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
- signal_pending(current)) {
- mutex_unlock(&sqd->lock);
- if (signal_pending(current))
- did_sig = get_signal(&ksig);
- cond_resched();
- mutex_lock(&sqd->lock);
- }
- return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
-}
-
-static int io_sq_thread(void *data)
-{
- struct io_sq_data *sqd = data;
- struct io_ring_ctx *ctx;
- unsigned long timeout = 0;
- char buf[TASK_COMM_LEN];
- DEFINE_WAIT(wait);
-
- snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
- set_task_comm(current, buf);
-
- if (sqd->sq_cpu != -1)
- set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
- else
- set_cpus_allowed_ptr(current, cpu_online_mask);
- current->flags |= PF_NO_SETAFFINITY;
-
- audit_alloc_kernel(current);
-
- mutex_lock(&sqd->lock);
- while (1) {
- bool cap_entries, sqt_spin = false;
-
- if (io_sqd_events_pending(sqd) || signal_pending(current)) {
- if (io_sqd_handle_event(sqd))
- break;
- timeout = jiffies + sqd->sq_thread_idle;
- }
-
- cap_entries = !list_is_singular(&sqd->ctx_list);
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- int ret = __io_sq_thread(ctx, cap_entries);
-
- if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
- sqt_spin = true;
- }
- if (io_run_task_work())
- sqt_spin = true;
-
- if (sqt_spin || !time_after(jiffies, timeout)) {
- cond_resched();
- if (sqt_spin)
- timeout = jiffies + sqd->sq_thread_idle;
- continue;
- }
-
- prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
- bool needs_sched = true;
-
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- atomic_or(IORING_SQ_NEED_WAKEUP,
- &ctx->rings->sq_flags);
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !wq_list_empty(&ctx->iopoll_list)) {
- needs_sched = false;
- break;
- }
-
- /*
- * Ensure the store of the wakeup flag is not
- * reordered with the load of the SQ tail
- */
- smp_mb__after_atomic();
-
- if (io_sqring_entries(ctx)) {
- needs_sched = false;
- break;
- }
- }
-
- if (needs_sched) {
- mutex_unlock(&sqd->lock);
- schedule();
- mutex_lock(&sqd->lock);
- }
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- atomic_andnot(IORING_SQ_NEED_WAKEUP,
- &ctx->rings->sq_flags);
- }
-
- finish_wait(&sqd->wait, &wait);
- timeout = jiffies + sqd->sq_thread_idle;
- }
-
- io_uring_cancel_generic(true, sqd);
- sqd->thread = NULL;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
- io_run_task_work();
- mutex_unlock(&sqd->lock);
-
- audit_free(current);
-
- complete(&sqd->exited);
- do_exit(0);
-}
-
-struct io_wait_queue {
- struct wait_queue_entry wq;
- struct io_ring_ctx *ctx;
- unsigned cq_tail;
- unsigned nr_timeouts;
-};
-
-static inline bool io_should_wake(struct io_wait_queue *iowq)
-{
- struct io_ring_ctx *ctx = iowq->ctx;
- int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
-
- /*
- * Wake up if we have enough events, or if a timeout occurred since we
- * started waiting. For timeouts, we always want to return to userspace,
- * regardless of event count.
- */
- return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
-}
-
-static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
- int wake_flags, void *key)
-{
- struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
- wq);
-
- /*
- * Cannot safely flush overflowed CQEs from here, ensure we wake up
- * the task, and the next invocation will do it.
- */
- if (io_should_wake(iowq) ||
- test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
- return autoremove_wake_function(curr, mode, wake_flags, key);
- return -1;
-}
-
-static int io_run_task_work_sig(void)
-{
- if (io_run_task_work())
- return 1;
- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
- return -ERESTARTSYS;
- if (task_sigpending(current))
- return -EINTR;
- return 0;
-}
-
-/* when returns >0, the caller should retry */
-static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq,
- ktime_t timeout)
-{
- int ret;
- unsigned long check_cq;
-
- /* make sure we run task_work before checking for signals */
- ret = io_run_task_work_sig();
- if (ret || io_should_wake(iowq))
- return ret;
- check_cq = READ_ONCE(ctx->check_cq);
- /* let the caller flush overflows, retry */
- if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
- return 1;
- if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
- return -EBADR;
- if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
- return -ETIME;
- return 1;
-}
-
-/*
- * Wait until events become available, if we don't already have some. The
- * application must reap them itself, as they reside on the shared cq ring.
- */
-static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz,
- struct __kernel_timespec __user *uts)
-{
- struct io_wait_queue iowq;
- struct io_rings *rings = ctx->rings;
- ktime_t timeout = KTIME_MAX;
- int ret;
-
- do {
- io_cqring_overflow_flush(ctx);
- if (io_cqring_events(ctx) >= min_events)
- return 0;
- if (!io_run_task_work())
- break;
- } while (1);
-
- if (sig) {
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
- else
-#endif
- ret = set_user_sigmask(sig, sigsz);
-
- if (ret)
- return ret;
- }
-
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
- }
-
- init_waitqueue_func_entry(&iowq.wq, io_wake_function);
- iowq.wq.private = current;
- INIT_LIST_HEAD(&iowq.wq.entry);
- iowq.ctx = ctx;
- iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
- iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-
- trace_io_uring_cqring_wait(ctx, min_events);
- do {
- /* if we can't even flush overflow, don't wait for more */
- if (!io_cqring_overflow_flush(ctx)) {
- ret = -EBUSY;
- break;
- }
- prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
- TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
- cond_resched();
- } while (ret > 0);
-
- finish_wait(&ctx->cq_wait, &iowq.wq);
- restore_saved_sigmask_unless(ret == -EINTR);
-
- return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
-}
-
-static void io_free_page_table(void **table, size_t size)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-
- for (i = 0; i < nr_tables; i++)
- kfree(table[i]);
- kfree(table);
-}
-
-static __cold void **io_alloc_page_table(size_t size)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
- size_t init_size = size;
- void **table;
-
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
- if (!table)
- return NULL;
-
- for (i = 0; i < nr_tables; i++) {
- unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
-
- table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
- if (!table[i]) {
- io_free_page_table(table, init_size);
- return NULL;
- }
- size -= this_size;
- }
- return table;
-}
-
-static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
-{
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
-}
-
-static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
-{
- struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
- struct io_ring_ctx *ctx = node->rsrc_data->ctx;
- unsigned long flags;
- bool first_add = false;
- unsigned long delay = HZ;
-
- spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
- node->done = true;
-
- /* if we are mid-quiesce then do not delay */
- if (node->rsrc_data->quiesce)
- delay = 0;
-
- while (!list_empty(&ctx->rsrc_ref_list)) {
- node = list_first_entry(&ctx->rsrc_ref_list,
- struct io_rsrc_node, node);
- /* recycle ref nodes in order */
- if (!node->done)
- break;
- list_del(&node->node);
- first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
- }
- spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
-
- if (first_add)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
-}
-
-static struct io_rsrc_node *io_rsrc_node_alloc(void)
-{
- struct io_rsrc_node *ref_node;
-
- ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
- if (!ref_node)
- return NULL;
-
- if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
- 0, GFP_KERNEL)) {
- kfree(ref_node);
- return NULL;
- }
- INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->rsrc_list);
- ref_node->done = false;
- return ref_node;
-}
-
-static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
- struct io_rsrc_data *data_to_kill)
- __must_hold(&ctx->uring_lock)
-{
- WARN_ON_ONCE(!ctx->rsrc_backup_node);
- WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
-
- io_rsrc_refs_drop(ctx);
-
- if (data_to_kill) {
- struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
-
- rsrc_node->rsrc_data = data_to_kill;
- spin_lock_irq(&ctx->rsrc_ref_lock);
- list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
- spin_unlock_irq(&ctx->rsrc_ref_lock);
-
- atomic_inc(&data_to_kill->refs);
- percpu_ref_kill(&rsrc_node->refs);
- ctx->rsrc_node = NULL;
- }
-
- if (!ctx->rsrc_node) {
- ctx->rsrc_node = ctx->rsrc_backup_node;
- ctx->rsrc_backup_node = NULL;
- }
-}
-
-static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
-{
- if (ctx->rsrc_backup_node)
- return 0;
- ctx->rsrc_backup_node = io_rsrc_node_alloc();
- return ctx->rsrc_backup_node ? 0 : -ENOMEM;
-}
-
-static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
- struct io_ring_ctx *ctx)
-{
- int ret;
-
- /* As we may drop ->uring_lock, other task may have started quiesce */
- if (data->quiesce)
- return -ENXIO;
-
- data->quiesce = true;
- do {
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- break;
- io_rsrc_node_switch(ctx, data);
-
- /* kill initial ref, already quiesced if zero */
- if (atomic_dec_and_test(&data->refs))
- break;
- mutex_unlock(&ctx->uring_lock);
- flush_delayed_work(&ctx->rsrc_put_work);
- ret = wait_for_completion_interruptible(&data->done);
- if (!ret) {
- mutex_lock(&ctx->uring_lock);
- if (atomic_read(&data->refs) > 0) {
- /*
- * it has been revived by another thread while
- * we were unlocked
- */
- mutex_unlock(&ctx->uring_lock);
- } else {
- break;
- }
- }
-
- atomic_inc(&data->refs);
- /* wait for all works potentially completing data->done */
- flush_delayed_work(&ctx->rsrc_put_work);
- reinit_completion(&data->done);
-
- ret = io_run_task_work_sig();
- mutex_lock(&ctx->uring_lock);
- } while (ret >= 0);
- data->quiesce = false;
-
- return ret;
-}
-
-static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
-{
- unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
- unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
-
- return &data->tags[table_idx][off];
-}
-
-static void io_rsrc_data_free(struct io_rsrc_data *data)
-{
- size_t size = data->nr * sizeof(data->tags[0][0]);
-
- if (data->tags)
- io_free_page_table((void **)data->tags, size);
- kfree(data);
-}
-
-static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
-{
- struct io_rsrc_data *data;
- int ret = -ENOMEM;
- unsigned i;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
- data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
- if (!data->tags) {
- kfree(data);
- return -ENOMEM;
- }
-
- data->nr = nr;
- data->ctx = ctx;
- data->do_put = do_put;
- if (utags) {
- ret = -EFAULT;
- for (i = 0; i < nr; i++) {
- u64 *tag_slot = io_get_tag_slot(data, i);
-
- if (copy_from_user(tag_slot, &utags[i],
- sizeof(*tag_slot)))
- goto fail;
- }
- }
-
- atomic_set(&data->refs, 1);
- init_completion(&data->done);
- *pdata = data;
- return 0;
-fail:
- io_rsrc_data_free(data);
- return ret;
-}
-
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]),
- GFP_KERNEL_ACCOUNT);
- if (unlikely(!table->files))
- return false;
-
- table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
- if (unlikely(!table->bitmap)) {
- kvfree(table->files);
- return false;
- }
-
- return true;
-}
-
-static void io_free_file_tables(struct io_file_table *table)
-{
- kvfree(table->files);
- bitmap_free(table->bitmap);
- table->files = NULL;
- table->bitmap = NULL;
-}
-
-static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
-{
- WARN_ON_ONCE(test_bit(bit, table->bitmap));
- __set_bit(bit, table->bitmap);
- table->alloc_hint = bit + 1;
-}
-
-static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
-{
- __clear_bit(bit, table->bitmap);
- table->alloc_hint = bit;
-}
-
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-#if !defined(IO_URING_SCM_ALL)
- int i;
-
- for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file = io_file_from_index(ctx, i);
-
- if (!file)
- continue;
- if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
- continue;
- io_file_bitmap_clear(&ctx->file_table, i);
- fput(file);
- }
-#endif
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
- kfree_skb(skb);
- }
-#endif
- io_free_file_tables(&ctx->file_table);
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
-}
-
-static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
- unsigned nr = ctx->nr_user_files;
- int ret;
-
- if (!ctx->file_data)
- return -ENXIO;
-
- /*
- * Quiesce may unlock ->uring_lock, and while it's not held
- * prevent new requests using the table.
- */
- ctx->nr_user_files = 0;
- ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
- ctx->nr_user_files = nr;
- if (!ret)
- __io_sqe_files_unregister(ctx);
- return ret;
-}
-
-static void io_sq_thread_unpark(struct io_sq_data *sqd)
- __releases(&sqd->lock)
-{
- WARN_ON_ONCE(sqd->thread == current);
-
- /*
- * Do the dance but not conditional clear_bit() because it'd race with
- * other threads incrementing park_pending and setting the bit.
- */
- clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- if (atomic_dec_return(&sqd->park_pending))
- set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- mutex_unlock(&sqd->lock);
-}
-
-static void io_sq_thread_park(struct io_sq_data *sqd)
- __acquires(&sqd->lock)
-{
- WARN_ON_ONCE(sqd->thread == current);
-
- atomic_inc(&sqd->park_pending);
- set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
-}
-
-static void io_sq_thread_stop(struct io_sq_data *sqd)
-{
- WARN_ON_ONCE(sqd->thread == current);
- WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
-
- set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
- mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
- mutex_unlock(&sqd->lock);
- wait_for_completion(&sqd->exited);
-}
-
-static void io_put_sq_data(struct io_sq_data *sqd)
-{
- if (refcount_dec_and_test(&sqd->refs)) {
- WARN_ON_ONCE(atomic_read(&sqd->park_pending));
-
- io_sq_thread_stop(sqd);
- kfree(sqd);
- }
-}
-
-static void io_sq_thread_finish(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd) {
- io_sq_thread_park(sqd);
- list_del_init(&ctx->sqd_list);
- io_sqd_update_thread_idle(sqd);
- io_sq_thread_unpark(sqd);
-
- io_put_sq_data(sqd);
- ctx->sq_data = NULL;
- }
-}
-
-static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
-{
- struct io_ring_ctx *ctx_attach;
- struct io_sq_data *sqd;
- struct fd f;
-
- f = fdget(p->wq_fd);
- if (!f.file)
- return ERR_PTR(-ENXIO);
- if (f.file->f_op != &io_uring_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- ctx_attach = f.file->private_data;
- sqd = ctx_attach->sq_data;
- if (!sqd) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
- if (sqd->task_tgid != current->tgid) {
- fdput(f);
- return ERR_PTR(-EPERM);
- }
-
- refcount_inc(&sqd->refs);
- fdput(f);
- return sqd;
-}
-
-static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
- bool *attached)
-{
- struct io_sq_data *sqd;
-
- *attached = false;
- if (p->flags & IORING_SETUP_ATTACH_WQ) {
- sqd = io_attach_sq_data(p);
- if (!IS_ERR(sqd)) {
- *attached = true;
- return sqd;
- }
- /* fall through for EPERM case, setup new sqd/task */
- if (PTR_ERR(sqd) != -EPERM)
- return sqd;
- }
-
- sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
- if (!sqd)
- return ERR_PTR(-ENOMEM);
-
- atomic_set(&sqd->park_pending, 0);
- refcount_set(&sqd->refs, 1);
- INIT_LIST_HEAD(&sqd->ctx_list);
- mutex_init(&sqd->lock);
- init_waitqueue_head(&sqd->wait);
- init_completion(&sqd->exited);
- return sqd;
-}
-
-/*
- * Ensure the UNIX gc is aware of our file set, so we are certain that
- * the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing. We account only files that can hold other
- * files because otherwise they can't form a loop and so are not interesting
- * for GC.
- */
-static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
-{
-#if defined(CONFIG_UNIX)
- struct sock *sk = ctx->ring_sock->sk;
- struct sk_buff_head *head = &sk->sk_receive_queue;
- struct scm_fp_list *fpl;
- struct sk_buff *skb;
-
- if (likely(!io_file_need_scm(file)))
- return 0;
-
- /*
- * See if we can merge this file into an existing skb SCM_RIGHTS
- * file set. If there's no room, fall back to allocating a new skb
- * and filling it in.
- */
- spin_lock_irq(&head->lock);
- skb = skb_peek(head);
- if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
- __skb_unlink(skb, head);
- else
- skb = NULL;
- spin_unlock_irq(&head->lock);
-
- if (!skb) {
- fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
- if (!fpl)
- return -ENOMEM;
-
- skb = alloc_skb(0, GFP_KERNEL);
- if (!skb) {
- kfree(fpl);
- return -ENOMEM;
- }
-
- fpl->user = get_uid(current_user());
- fpl->max = SCM_MAX_FD;
- fpl->count = 0;
-
- UNIXCB(skb).fp = fpl;
- skb->sk = sk;
- skb->destructor = unix_destruct_scm;
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- }
-
- fpl = UNIXCB(skb).fp;
- fpl->fp[fpl->count++] = get_file(file);
- unix_inflight(fpl->user, file);
- skb_queue_head(head, skb);
- fput(file);
-#endif
- return 0;
-}
-
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
- struct file *file = prsrc->file;
-#if defined(CONFIG_UNIX)
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head list, *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
- int i;
-
- if (!io_file_need_scm(file)) {
- fput(file);
- return;
- }
-
- __skb_queue_head_init(&list);
-
- /*
- * Find the skb that holds this file in its SCM_RIGHTS. When found,
- * remove this entry and rearrange the file array.
- */
- skb = skb_dequeue(head);
- while (skb) {
- struct scm_fp_list *fp;
-
- fp = UNIXCB(skb).fp;
- for (i = 0; i < fp->count; i++) {
- int left;
-
- if (fp->fp[i] != file)
- continue;
-
- unix_notinflight(fp->user, fp->fp[i]);
- left = fp->count - 1 - i;
- if (left) {
- memmove(&fp->fp[i], &fp->fp[i + 1],
- left * sizeof(struct file *));
- }
- fp->count--;
- if (!fp->count) {
- kfree_skb(skb);
- skb = NULL;
- } else {
- __skb_queue_tail(&list, skb);
- }
- fput(file);
- file = NULL;
- break;
- }
-
- if (!file)
- break;
-
- __skb_queue_tail(&list, skb);
-
- skb = skb_dequeue(head);
- }
-
- if (skb_peek(&list)) {
- spin_lock_irq(&head->lock);
- while ((skb = __skb_dequeue(&list)) != NULL)
- __skb_queue_tail(head, skb);
- spin_unlock_irq(&head->lock);
- }
-#else
- fput(file);
-#endif
-}
-
-static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
-{
- struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
- struct io_ring_ctx *ctx = rsrc_data->ctx;
- struct io_rsrc_put *prsrc, *tmp;
-
- list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
- list_del(&prsrc->list);
-
- if (prsrc->tag) {
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_lock(&ctx->uring_lock);
-
- spin_lock(&ctx->completion_lock);
- io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_unlock(&ctx->uring_lock);
- }
-
- rsrc_data->do_put(ctx, prsrc);
- kfree(prsrc);
- }
-
- io_rsrc_node_destroy(ref_node);
- if (atomic_dec_and_test(&rsrc_data->refs))
- complete(&rsrc_data->done);
-}
-
-static void io_rsrc_put_work(struct work_struct *work)
-{
- struct io_ring_ctx *ctx;
- struct llist_node *node;
-
- ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
- node = llist_del_all(&ctx->rsrc_put_llist);
-
- while (node) {
- struct io_rsrc_node *ref_node;
- struct llist_node *next = node->next;
-
- ref_node = llist_entry(node, struct io_rsrc_node, llist);
- __io_rsrc_put_work(ref_node);
- node = next;
- }
-}
-
-static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args, u64 __user *tags)
-{
- __s32 __user *fds = (__s32 __user *) arg;
- struct file *file;
- int fd, ret;
- unsigned i;
-
- if (ctx->file_data)
- return -EBUSY;
- if (!nr_args)
- return -EINVAL;
- if (nr_args > IORING_MAX_FIXED_FILES)
- return -EMFILE;
- if (nr_args > rlimit(RLIMIT_NOFILE))
- return -EMFILE;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- return ret;
- ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
- &ctx->file_data);
- if (ret)
- return ret;
-
- if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- return -ENOMEM;
- }
-
- for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- struct io_fixed_file *file_slot;
-
- if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
- ret = -EFAULT;
- goto fail;
- }
- /* allow sparse sets */
- if (!fds || fd == -1) {
- ret = -EINVAL;
- if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
- goto fail;
- continue;
- }
-
- file = fget(fd);
- ret = -EBADF;
- if (unlikely(!file))
- goto fail;
-
- /*
- * Don't allow io_uring instances to be registered. If UNIX
- * isn't enabled, then this causes a reference cycle and this
- * instance can never get freed. If UNIX is enabled we'll
- * handle it just fine, but there's still no point in allowing
- * a ring fd as it doesn't support regular read/write anyway.
- */
- if (file->f_op == &io_uring_fops) {
- fput(file);
- goto fail;
- }
- ret = io_scm_file_account(ctx, file);
- if (ret) {
- fput(file);
- goto fail;
- }
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
- io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, i);
- }
-
- io_rsrc_node_switch(ctx, NULL);
- return 0;
-fail:
- __io_sqe_files_unregister(ctx);
- return ret;
-}
-
-static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
- struct io_rsrc_node *node, void *rsrc)
-{
- u64 *tag_slot = io_get_tag_slot(data, idx);
- struct io_rsrc_put *prsrc;
-
- prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
- if (!prsrc)
- return -ENOMEM;
-
- prsrc->tag = *tag_slot;
- *tag_slot = 0;
- prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &node->rsrc_list);
- return 0;
-}
-
-static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
- unsigned int issue_flags, u32 slot_index)
- __must_hold(&req->ctx->uring_lock)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool needs_switch = false;
- struct io_fixed_file *file_slot;
- int ret;
-
- if (file->f_op == &io_uring_fops)
- return -EBADF;
- if (!ctx->file_data)
- return -ENXIO;
- if (slot_index >= ctx->nr_user_files)
- return -EINVAL;
-
- slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-
- if (file_slot->file_ptr) {
- struct file *old_file;
-
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto err;
-
- old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- ctx->rsrc_node, old_file);
- if (ret)
- goto err;
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, slot_index);
- needs_switch = true;
- }
-
- ret = io_scm_file_account(ctx, file);
- if (!ret) {
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, slot_index);
- }
-err:
- if (needs_switch)
- io_rsrc_node_switch(ctx, ctx->file_data);
- if (ret)
- fput(file);
- return ret;
-}
-
-static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
- unsigned int offset)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_fixed_file *file_slot;
- struct file *file;
- int ret;
-
- io_ring_submit_lock(ctx, issue_flags);
- ret = -ENXIO;
- if (unlikely(!ctx->file_data))
- goto out;
- ret = -EINVAL;
- if (offset >= ctx->nr_user_files)
- goto out;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto out;
-
- offset = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, offset);
- ret = -EBADF;
- if (!file_slot->file_ptr)
- goto out;
-
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
- if (ret)
- goto out;
-
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, offset);
- io_rsrc_node_switch(ctx, ctx->file_data);
- ret = 0;
-out:
- io_ring_submit_unlock(ctx, issue_flags);
- return ret;
-}
-
-static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
-{
- return __io_close_fixed(req, issue_flags, req->close.file_slot - 1);
-}
-
-static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_rsrc_update2 *up,
- unsigned nr_args)
-{
- u64 __user *tags = u64_to_user_ptr(up->tags);
- __s32 __user *fds = u64_to_user_ptr(up->data);
- struct io_rsrc_data *data = ctx->file_data;
- struct io_fixed_file *file_slot;
- struct file *file;
- int fd, i, err = 0;
- unsigned int done;
- bool needs_switch = false;
-
- if (!ctx->file_data)
- return -ENXIO;
- if (up->offset + nr_args > ctx->nr_user_files)
- return -EINVAL;
-
- for (done = 0; done < nr_args; done++) {
- u64 tag = 0;
-
- if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
- copy_from_user(&fd, &fds[done], sizeof(fd))) {
- err = -EFAULT;
- break;
- }
- if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
- err = -EINVAL;
- break;
- }
- if (fd == IORING_REGISTER_FILES_SKIP)
- continue;
-
- i = array_index_nospec(up->offset + done, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
- if (file_slot->file_ptr) {
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
- if (err)
- break;
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, i);
- needs_switch = true;
- }
- if (fd != -1) {
- file = fget(fd);
- if (!file) {
- err = -EBADF;
- break;
- }
- /*
- * Don't allow io_uring instances to be registered. If
- * UNIX isn't enabled, then this causes a reference
- * cycle and this instance can never get freed. If UNIX
- * is enabled we'll handle it just fine, but there's
- * still no point in allowing a ring fd as it doesn't
- * support regular read/write anyway.
- */
- if (file->f_op == &io_uring_fops) {
- fput(file);
- err = -EBADF;
- break;
- }
- err = io_scm_file_account(ctx, file);
- if (err) {
- fput(file);
- break;
- }
- *io_get_tag_slot(data, i) = tag;
- io_fixed_file_set(file_slot, file);
- io_file_bitmap_set(&ctx->file_table, i);
- }
- }
-
- if (needs_switch)
- io_rsrc_node_switch(ctx, data);
- return done ? done : err;
-}
-
-static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
- struct task_struct *task)
-{
- struct io_wq_hash *hash;
- struct io_wq_data data;
- unsigned int concurrency;
-
- mutex_lock(&ctx->uring_lock);
- hash = ctx->hash_map;
- if (!hash) {
- hash = kzalloc(sizeof(*hash), GFP_KERNEL);
- if (!hash) {
- mutex_unlock(&ctx->uring_lock);
- return ERR_PTR(-ENOMEM);
- }
- refcount_set(&hash->refs, 1);
- init_waitqueue_head(&hash->wait);
- ctx->hash_map = hash;
- }
- mutex_unlock(&ctx->uring_lock);
-
- data.hash = hash;
- data.task = task;
- data.free_work = io_wq_free_work;
- data.do_work = io_wq_submit_work;
-
- /* Do QD, or 4 * CPUS, whatever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-
- return io_wq_create(concurrency, &data);
-}
-
-static __cold int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx;
- int ret;
-
- tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
- if (unlikely(!tctx))
- return -ENOMEM;
-
- tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
- sizeof(struct file *), GFP_KERNEL);
- if (unlikely(!tctx->registered_rings)) {
- kfree(tctx);
- return -ENOMEM;
- }
-
- ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
- if (unlikely(ret)) {
- kfree(tctx->registered_rings);
- kfree(tctx);
- return ret;
- }
-
- tctx->io_wq = io_init_wq_offload(ctx, task);
- if (IS_ERR(tctx->io_wq)) {
- ret = PTR_ERR(tctx->io_wq);
- percpu_counter_destroy(&tctx->inflight);
- kfree(tctx->registered_rings);
- kfree(tctx);
- return ret;
- }
-
- xa_init(&tctx->xa);
- init_waitqueue_head(&tctx->wait);
- atomic_set(&tctx->in_idle, 0);
- atomic_set(&tctx->inflight_tracked, 0);
- task->io_uring = tctx;
- spin_lock_init(&tctx->task_lock);
- INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
- init_task_work(&tctx->task_work, tctx_task_work);
- return 0;
-}
-
-void __io_uring_free(struct task_struct *tsk)
-{
- struct io_uring_task *tctx = tsk->io_uring;
-
- WARN_ON_ONCE(!xa_empty(&tctx->xa));
- WARN_ON_ONCE(tctx->io_wq);
- WARN_ON_ONCE(tctx->cached_refs);
-
- kfree(tctx->registered_rings);
- percpu_counter_destroy(&tctx->inflight);
- kfree(tctx);
- tsk->io_uring = NULL;
-}
-
-static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
-{
- int ret;
-
- /* Retain compatibility with failing for an invalid attach attempt */
- if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
- IORING_SETUP_ATTACH_WQ) {
- struct fd f;
-
- f = fdget(p->wq_fd);
- if (!f.file)
- return -ENXIO;
- if (f.file->f_op != &io_uring_fops) {
- fdput(f);
- return -EINVAL;
- }
- fdput(f);
- }
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- struct task_struct *tsk;
- struct io_sq_data *sqd;
- bool attached;
-
- ret = security_uring_sqpoll();
- if (ret)
- return ret;
-
- sqd = io_get_sq_data(p, &attached);
- if (IS_ERR(sqd)) {
- ret = PTR_ERR(sqd);
- goto err;
- }
-
- ctx->sq_creds = get_current_cred();
- ctx->sq_data = sqd;
- ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
- if (!ctx->sq_thread_idle)
- ctx->sq_thread_idle = HZ;
-
- io_sq_thread_park(sqd);
- list_add(&ctx->sqd_list, &sqd->ctx_list);
- io_sqd_update_thread_idle(sqd);
- /* don't attach to a dying SQPOLL thread, would be racy */
- ret = (attached && !sqd->thread) ? -ENXIO : 0;
- io_sq_thread_unpark(sqd);
-
- if (ret < 0)
- goto err;
- if (attached)
- return 0;
-
- if (p->flags & IORING_SETUP_SQ_AFF) {
- int cpu = p->sq_thread_cpu;
-
- ret = -EINVAL;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu))
- goto err_sqpoll;
- sqd->sq_cpu = cpu;
- } else {
- sqd->sq_cpu = -1;
- }
-
- sqd->task_pid = current->pid;
- sqd->task_tgid = current->tgid;
- tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
- if (IS_ERR(tsk)) {
- ret = PTR_ERR(tsk);
- goto err_sqpoll;
- }
-
- sqd->thread = tsk;
- ret = io_uring_alloc_task_context(tsk, ctx);
- wake_up_new_task(tsk);
- if (ret)
- goto err;
- } else if (p->flags & IORING_SETUP_SQ_AFF) {
- /* Can't have SQ_AFF without SQPOLL */
- ret = -EINVAL;
- goto err;
- }
-
- return 0;
-err_sqpoll:
- complete(&ctx->sq_data->exited);
-err:
- io_sq_thread_finish(ctx);
- return ret;
-}
-
-static inline void __io_unaccount_mem(struct user_struct *user,
- unsigned long nr_pages)
-{
- atomic_long_sub(nr_pages, &user->locked_vm);
-}
-
-static inline int __io_account_mem(struct user_struct *user,
- unsigned long nr_pages)
-{
- unsigned long page_limit, cur_pages, new_pages;
-
- /* Don't allow more pages than we can safely lock */
- page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- do {
- cur_pages = atomic_long_read(&user->locked_vm);
- new_pages = cur_pages + nr_pages;
- if (new_pages > page_limit)
- return -ENOMEM;
- } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
- new_pages) != cur_pages);
-
- return 0;
-}
-
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
-{
- if (ctx->user)
- __io_unaccount_mem(ctx->user, nr_pages);
-
- if (ctx->mm_account)
- atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
-}
-
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
-{
- int ret;
-
- if (ctx->user) {
- ret = __io_account_mem(ctx->user, nr_pages);
- if (ret)
- return ret;
- }
-
- if (ctx->mm_account)
- atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
-
- return 0;
-}
-
-static void io_mem_free(void *ptr)
-{
- struct page *page;
-
- if (!ptr)
- return;
-
- page = virt_to_head_page(ptr);
- if (put_page_testzero(page))
- free_compound_page(page);
-}
-
-static void *io_mem_alloc(size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
-
- return (void *) __get_free_pages(gfp, get_order(size));
-}
-
-static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
- unsigned int cq_entries, size_t *sq_offset)
-{
- struct io_rings *rings;
- size_t off, sq_array_size;
-
- off = struct_size(rings, cqes, cq_entries);
- if (off == SIZE_MAX)
- return SIZE_MAX;
- if (ctx->flags & IORING_SETUP_CQE32) {
- if (check_shl_overflow(off, 1, &off))
- return SIZE_MAX;
- }
-
-#ifdef CONFIG_SMP
- off = ALIGN(off, SMP_CACHE_BYTES);
- if (off == 0)
- return SIZE_MAX;
-#endif
-
- if (sq_offset)
- *sq_offset = off;
-
- sq_array_size = array_size(sizeof(u32), sq_entries);
- if (sq_array_size == SIZE_MAX)
- return SIZE_MAX;
-
- if (check_add_overflow(off, sq_array_size, &off))
- return SIZE_MAX;
-
- return off;
-}
-
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
-{
- struct io_mapped_ubuf *imu = *slot;
- unsigned int i;
-
- if (imu != ctx->dummy_ubuf) {
- for (i = 0; i < imu->nr_bvecs; i++)
- unpin_user_page(imu->bvec[i].bv_page);
- if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages);
- kvfree(imu);
- }
- *slot = NULL;
-}
-
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
- io_buffer_unmap(ctx, &prsrc->buf);
- prsrc->buf = NULL;
-}
-
-static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
- unsigned int i;
-
- for (i = 0; i < ctx->nr_user_bufs; i++)
- io_buffer_unmap(ctx, &ctx->user_bufs[i]);
- kfree(ctx->user_bufs);
- io_rsrc_data_free(ctx->buf_data);
- ctx->user_bufs = NULL;
- ctx->buf_data = NULL;
- ctx->nr_user_bufs = 0;
-}
-
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
- unsigned nr = ctx->nr_user_bufs;
- int ret;
-
- if (!ctx->buf_data)
- return -ENXIO;
-
- /*
- * Quiesce may unlock ->uring_lock, and while it's not held
- * prevent new requests using the table.
- */
- ctx->nr_user_bufs = 0;
- ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
- ctx->nr_user_bufs = nr;
- if (!ret)
- __io_sqe_buffers_unregister(ctx);
- return ret;
-}
-
-static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
- void __user *arg, unsigned index)
-{
- struct iovec __user *src;
-
-#ifdef CONFIG_COMPAT
- if (ctx->compat) {
- struct compat_iovec __user *ciovs;
- struct compat_iovec ciov;
-
- ciovs = (struct compat_iovec __user *) arg;
- if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
- return -EFAULT;
-
- dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
- dst->iov_len = ciov.iov_len;
- return 0;
- }
-#endif
- src = (struct iovec __user *) arg;
- if (copy_from_user(dst, &src[index], sizeof(*dst)))
- return -EFAULT;
- return 0;
-}
-
-/*
- * Not super efficient, but this is just a registration time. And we do cache
- * the last compound head, so generally we'll only do a full search if we don't
- * match that one.
- *
- * We check if the given compound head page has already been accounted, to
- * avoid double accounting it. This allows us to account the full size of the
- * page, not just the constituent pages of a huge page.
- */
-static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
- int nr_pages, struct page *hpage)
-{
- int i, j;
-
- /* check current page array */
- for (i = 0; i < nr_pages; i++) {
- if (!PageCompound(pages[i]))
- continue;
- if (compound_head(pages[i]) == hpage)
- return true;
- }
-
- /* check previously registered pages */
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = ctx->user_bufs[i];
-
- for (j = 0; j < imu->nr_bvecs; j++) {
- if (!PageCompound(imu->bvec[j].bv_page))
- continue;
- if (compound_head(imu->bvec[j].bv_page) == hpage)
- return true;
- }
- }
-
- return false;
-}
-
-static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
- int nr_pages, struct io_mapped_ubuf *imu,
- struct page **last_hpage)
-{
- int i, ret;
-
- imu->acct_pages = 0;
- for (i = 0; i < nr_pages; i++) {
- if (!PageCompound(pages[i])) {
- imu->acct_pages++;
- } else {
- struct page *hpage;
-
- hpage = compound_head(pages[i]);
- if (hpage == *last_hpage)
- continue;
- *last_hpage = hpage;
- if (headpage_already_acct(ctx, pages, i, hpage))
- continue;
- imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
- }
- }
-
- if (!imu->acct_pages)
- return 0;
-
- ret = io_account_mem(ctx, imu->acct_pages);
- if (ret)
- imu->acct_pages = 0;
- return ret;
-}
-
-static struct page **io_pin_pages(unsigned long ubuf, unsigned long len,
- int *npages)
-{
- unsigned long start, end, nr_pages;
- struct vm_area_struct **vmas = NULL;
- struct page **pages = NULL;
- int i, pret, ret = -ENOMEM;
-
- end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
-
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- goto done;
-
- vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- goto done;
-
- ret = 0;
- mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages, vmas);
- if (pret == nr_pages) {
- /* don't support file backed memory */
- for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma_is_shmem(vma))
- continue;
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
- ret = -EOPNOTSUPP;
- break;
- }
- }
- *npages = nr_pages;
- } else {
- ret = pret < 0 ? pret : -EFAULT;
- }
- mmap_read_unlock(current->mm);
- if (ret) {
- /*
- * if we did partial map, or found file backed vmas,
- * release any pages we did get
- */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- goto done;
- }
- ret = 0;
-done:
- kvfree(vmas);
- if (ret < 0) {
- kvfree(pages);
- pages = ERR_PTR(ret);
- }
- return pages;
-}
-
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage)
-{
- struct io_mapped_ubuf *imu = NULL;
- struct page **pages = NULL;
- unsigned long off;
- size_t size;
- int ret, nr_pages, i;
-
- if (!iov->iov_base) {
- *pimu = ctx->dummy_ubuf;
- return 0;
- }
-
- *pimu = NULL;
- ret = -ENOMEM;
-
- pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
- &nr_pages);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
- pages = NULL;
- goto done;
- }
-
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
- if (!imu)
- goto done;
-
- ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
- if (ret) {
- unpin_user_pages(pages, nr_pages);
- goto done;
- }
-
- off = (unsigned long) iov->iov_base & ~PAGE_MASK;
- size = iov->iov_len;
- for (i = 0; i < nr_pages; i++) {
- size_t vec_len;
-
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
- imu->bvec[i].bv_page = pages[i];
- imu->bvec[i].bv_len = vec_len;
- imu->bvec[i].bv_offset = off;
- off = 0;
- size -= vec_len;
- }
- /* store original address for later verification */
- imu->ubuf = (unsigned long) iov->iov_base;
- imu->ubuf_end = imu->ubuf + iov->iov_len;
- imu->nr_bvecs = nr_pages;
- *pimu = imu;
- ret = 0;
-done:
- if (ret)
- kvfree(imu);
- kvfree(pages);
- return ret;
-}
-
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
- ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
- return ctx->user_bufs ? 0 : -ENOMEM;
-}
-
-static int io_buffer_validate(struct iovec *iov)
-{
- unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
-
- /*
- * Don't impose further limits on the size and buffer
- * constraints here, we'll -EINVAL later when IO is
- * submitted if they are wrong.
- */
- if (!iov->iov_base)
- return iov->iov_len ? -EFAULT : 0;
- if (!iov->iov_len)
- return -EFAULT;
-
- /* arbitrary limit, but we need something */
- if (iov->iov_len > SZ_1G)
- return -EFAULT;
-
- if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
- return -EOVERFLOW;
-
- return 0;
-}
-
-static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args, u64 __user *tags)
-{
- struct page *last_hpage = NULL;
- struct io_rsrc_data *data;
- int i, ret;
- struct iovec iov;
-
- if (ctx->user_bufs)
- return -EBUSY;
- if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
- return -EINVAL;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- return ret;
- ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
- if (ret)
- return ret;
- ret = io_buffers_map_alloc(ctx, nr_args);
- if (ret) {
- io_rsrc_data_free(data);
- return ret;
- }
-
- for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- if (arg) {
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
- break;
- ret = io_buffer_validate(&iov);
- if (ret)
- break;
- } else {
- memset(&iov, 0, sizeof(iov));
- }
-
- if (!iov.iov_base && *io_get_tag_slot(data, i)) {
- ret = -EINVAL;
- break;
- }
-
- ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
- &last_hpage);
- if (ret)
- break;
- }
-
- WARN_ON_ONCE(ctx->buf_data);
-
- ctx->buf_data = data;
- if (ret)
- __io_sqe_buffers_unregister(ctx);
- else
- io_rsrc_node_switch(ctx, NULL);
- return ret;
-}
-
-static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
- struct io_uring_rsrc_update2 *up,
- unsigned int nr_args)
-{
- u64 __user *tags = u64_to_user_ptr(up->tags);
- struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
- struct page *last_hpage = NULL;
- bool needs_switch = false;
- __u32 done;
- int i, err;
-
- if (!ctx->buf_data)
- return -ENXIO;
- if (up->offset + nr_args > ctx->nr_user_bufs)
- return -EINVAL;
-
- for (done = 0; done < nr_args; done++) {
- struct io_mapped_ubuf *imu;
- int offset = up->offset + done;
- u64 tag = 0;
-
- err = io_copy_iov(ctx, &iov, iovs, done);
- if (err)
- break;
- if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
- err = -EFAULT;
- break;
- }
- err = io_buffer_validate(&iov);
- if (err)
- break;
- if (!iov.iov_base && tag) {
- err = -EINVAL;
- break;
- }
- err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
- if (err)
- break;
-
- i = array_index_nospec(offset, ctx->nr_user_bufs);
- if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
- err = io_queue_rsrc_removal(ctx->buf_data, i,
- ctx->rsrc_node, ctx->user_bufs[i]);
- if (unlikely(err)) {
- io_buffer_unmap(ctx, &imu);
- break;
- }
- ctx->user_bufs[i] = NULL;
- needs_switch = true;
- }
-
- ctx->user_bufs[i] = imu;
- *io_get_tag_slot(ctx->buf_data, offset) = tag;
- }
-
- if (needs_switch)
- io_rsrc_node_switch(ctx, ctx->buf_data);
- return done ? done : err;
-}
-
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- return 0;
-}
-
-static void io_eventfd_put(struct rcu_head *rcu)
-{
- struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
-
- eventfd_ctx_put(ev_fd->cq_ev_fd);
- kfree(ev_fd);
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- call_rcu(&ev_fd->rcu, io_eventfd_put);
- return 0;
- }
-
- return -ENXIO;
-}
-
-static void io_destroy_buffers(struct io_ring_ctx *ctx)
-{
- struct io_buffer_list *bl;
- unsigned long index;
- int i;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- if (!ctx->io_bl)
- break;
- __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
- }
-
- xa_for_each(&ctx->io_bl_xa, index, bl) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
- __io_remove_buffers(ctx, bl, -1U);
- kfree(bl);
- }
-
- while (!list_empty(&ctx->io_buffers_pages)) {
- struct page *page;
-
- page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
- list_del_init(&page->lru);
- __free_page(page);
- }
-}
-
-static void io_req_caches_free(struct io_ring_ctx *ctx)
-{
- struct io_submit_state *state = &ctx->submit_state;
- int nr = 0;
-
- mutex_lock(&ctx->uring_lock);
- io_flush_cached_locked_reqs(ctx, state);
-
- while (!io_req_cache_empty(ctx)) {
- struct io_wq_work_node *node;
- struct io_kiocb *req;
-
- node = wq_stack_extract(&state->free_list);
- req = container_of(node, struct io_kiocb, comp_list);
- kmem_cache_free(req_cachep, req);
- nr++;
- }
- if (nr)
- percpu_ref_put_many(&ctx->refs, nr);
- mutex_unlock(&ctx->uring_lock);
-}
-
-static void io_wait_rsrc_data(struct io_rsrc_data *data)
-{
- if (data && !atomic_dec_and_test(&data->refs))
- wait_for_completion(&data->done);
-}
-
-static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
-{
- struct async_poll *apoll;
-
- while (!list_empty(&ctx->apoll_cache)) {
- apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
- poll.wait.entry);
- list_del(&apoll->poll.wait.entry);
- kfree(apoll);
- }
-}
-
-static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
-{
- io_sq_thread_finish(ctx);
-
- if (ctx->mm_account) {
- mmdrop(ctx->mm_account);
- ctx->mm_account = NULL;
- }
-
- io_rsrc_refs_drop(ctx);
- /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
- io_wait_rsrc_data(ctx->buf_data);
- io_wait_rsrc_data(ctx->file_data);
-
- mutex_lock(&ctx->uring_lock);
- if (ctx->buf_data)
- __io_sqe_buffers_unregister(ctx);
- if (ctx->file_data)
- __io_sqe_files_unregister(ctx);
- if (ctx->rings)
- __io_cqring_overflow_flush(ctx, true);
- io_eventfd_unregister(ctx);
- io_flush_apoll_cache(ctx);
- mutex_unlock(&ctx->uring_lock);
- io_destroy_buffers(ctx);
- if (ctx->sq_creds)
- put_cred(ctx->sq_creds);
-
- /* there are no registered resources left, nobody uses it */
- if (ctx->rsrc_node)
- io_rsrc_node_destroy(ctx->rsrc_node);
- if (ctx->rsrc_backup_node)
- io_rsrc_node_destroy(ctx->rsrc_backup_node);
- flush_delayed_work(&ctx->rsrc_put_work);
- flush_delayed_work(&ctx->fallback_work);
-
- WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
- WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
- WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
-
- io_mem_free(ctx->rings);
- io_mem_free(ctx->sq_sqes);
-
- percpu_ref_exit(&ctx->refs);
- free_uid(ctx->user);
- io_req_caches_free(ctx);
- if (ctx->hash_map)
- io_wq_put_hash(ctx->hash_map);
- kfree(ctx->cancel_hash);
- kfree(ctx->dummy_ubuf);
- kfree(ctx->io_bl);
- xa_destroy(&ctx->io_bl_xa);
- kfree(ctx);
-}
-
-static __poll_t io_uring_poll(struct file *file, poll_table *wait)
-{
- struct io_ring_ctx *ctx = file->private_data;
- __poll_t mask = 0;
-
- poll_wait(file, &ctx->cq_wait, wait);
- /*
- * synchronizes with barrier from wq_has_sleeper call in
- * io_commit_cqring
- */
- smp_rmb();
- if (!io_sqring_full(ctx))
- mask |= EPOLLOUT | EPOLLWRNORM;
-
- /*
- * Don't flush cqring overflow list here, just do a simple check.
- * Otherwise there could possible be ABBA deadlock:
- * CPU0 CPU1
- * ---- ----
- * lock(&ctx->uring_lock);
- * lock(&ep->mtx);
- * lock(&ctx->uring_lock);
- * lock(&ep->mtx);
- *
- * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
- * pushs them to do the flush.
- */
- if (io_cqring_events(ctx) ||
- test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
- mask |= EPOLLIN | EPOLLRDNORM;
-
- return mask;
-}
-
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
-struct io_tctx_exit {
- struct callback_head task_work;
- struct completion completion;
- struct io_ring_ctx *ctx;
-};
-
-static __cold void io_tctx_exit_cb(struct callback_head *cb)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_exit *work;
-
- work = container_of(cb, struct io_tctx_exit, task_work);
- /*
- * When @in_idle, we're in cancellation and it's racy to remove the
- * node. It'll be removed by the end of cancellation, just ignore it.
- */
- if (!atomic_read(&tctx->in_idle))
- io_uring_del_tctx_node((unsigned long)work->ctx);
- complete(&work->completion);
-}
-
-static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- return req->ctx == data;
-}
-
-static __cold void io_ring_exit_work(struct work_struct *work)
-{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
- unsigned long timeout = jiffies + HZ * 60 * 5;
- unsigned long interval = HZ / 20;
- struct io_tctx_exit exit;
- struct io_tctx_node *node;
- int ret;
-
- /*
- * If we're doing polled IO and end up having requests being
- * submitted async (out-of-line), then completions can come in while
- * we're waiting for refs to drop. We need to reap these manually,
- * as nobody else will be looking for them.
- */
- do {
- io_uring_try_cancel_requests(ctx, NULL, true);
- if (ctx->sq_data) {
- struct io_sq_data *sqd = ctx->sq_data;
- struct task_struct *tsk;
-
- io_sq_thread_park(sqd);
- tsk = sqd->thread;
- if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
- io_wq_cancel_cb(tsk->io_uring->io_wq,
- io_cancel_ctx_cb, ctx, true);
- io_sq_thread_unpark(sqd);
- }
-
- io_req_caches_free(ctx);
-
- if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
- /* there is little hope left, don't run it too often */
- interval = HZ * 60;
- }
- } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
-
- init_completion(&exit.completion);
- init_task_work(&exit.task_work, io_tctx_exit_cb);
- exit.ctx = ctx;
- /*
- * Some may use context even when all refs and requests have been put,
- * and they are free to do so while still holding uring_lock or
- * completion_lock, see io_req_task_submit(). Apart from other work,
- * this lock/unlock section also waits them to finish.
- */
- mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->tctx_list)) {
- WARN_ON_ONCE(time_after(jiffies, timeout));
-
- node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
- ctx_node);
- /* don't spin on a single task if cancellation failed */
- list_rotate_left(&ctx->tctx_list);
- ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
- if (WARN_ON_ONCE(ret))
- continue;
-
- mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&exit.completion);
- mutex_lock(&ctx->uring_lock);
- }
- mutex_unlock(&ctx->uring_lock);
- spin_lock(&ctx->completion_lock);
- spin_unlock(&ctx->completion_lock);
-
- io_ring_ctx_free(ctx);
-}
-
-/* Returns true if we found and killed one or more timeouts */
-static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
- struct task_struct *tsk, bool cancel_all)
-{
- struct io_kiocb *req, *tmp;
- int canceled = 0;
-
- spin_lock(&ctx->completion_lock);
- spin_lock_irq(&ctx->timeout_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_match_task(req, tsk, cancel_all)) {
- io_kill_timeout(req, -ECANCELED);
- canceled++;
- }
- }
- spin_unlock_irq(&ctx->timeout_lock);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (canceled != 0)
- io_cqring_ev_posted(ctx);
- return canceled != 0;
-}
-
-static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
-{
- unsigned long index;
- struct creds *creds;
-
- mutex_lock(&ctx->uring_lock);
- percpu_ref_kill(&ctx->refs);
- if (ctx->rings)
- __io_cqring_overflow_flush(ctx, true);
- xa_for_each(&ctx->personalities, index, creds)
- io_unregister_personality(ctx, index);
- mutex_unlock(&ctx->uring_lock);
-
- /* failed during ring init, it couldn't have issued any requests */
- if (ctx->rings) {
- io_kill_timeouts(ctx, NULL, true);
- io_poll_remove_all(ctx, NULL, true);
- /* if we failed setting up the ctx, we might not have any rings */
- io_iopoll_try_reap_events(ctx);
- }
-
- INIT_WORK(&ctx->exit_work, io_ring_exit_work);
- /*
- * Use system_unbound_wq to avoid spawning tons of event kworkers
- * if we're exiting a ton of rings at the same time. It just adds
- * noise and overhead, there's no discernable change in runtime
- * over using system_wq.
- */
- queue_work(system_unbound_wq, &ctx->exit_work);
-}
-
-static int io_uring_release(struct inode *inode, struct file *file)
-{
- struct io_ring_ctx *ctx = file->private_data;
-
- file->private_data = NULL;
- io_ring_ctx_wait_and_kill(ctx);
- return 0;
-}
-
-struct io_task_cancel {
- struct task_struct *task;
- bool all;
-};
-
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_task_cancel *cancel = data;
-
- return io_match_task_safe(req, cancel->task, cancel->all);
-}
-
-static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
-{
- struct io_defer_entry *de;
- LIST_HEAD(list);
-
- spin_lock(&ctx->completion_lock);
- list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_task_safe(de->req, task, cancel_all)) {
- list_cut_position(&list, &ctx->defer_list, &de->list);
- break;
- }
- }
- spin_unlock(&ctx->completion_lock);
- if (list_empty(&list))
- return false;
-
- while (!list_empty(&list)) {
- de = list_first_entry(&list, struct io_defer_entry, list);
- list_del_init(&de->list);
- io_req_complete_failed(de->req, -ECANCELED);
- kfree(de);
- }
- return true;
-}
-
-static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
-{
- struct io_tctx_node *node;
- enum io_wq_cancel cret;
- bool ret = false;
-
- mutex_lock(&ctx->uring_lock);
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- /*
- * io_wq will stay alive while we hold uring_lock, because it's
- * killed after ctx nodes, which requires to take the lock.
- */
- if (!tctx || !tctx->io_wq)
- continue;
- cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
- ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
- }
- mutex_unlock(&ctx->uring_lock);
-
- return ret;
-}
-
-static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
-{
- struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
- struct io_uring_task *tctx = task ? task->io_uring : NULL;
-
- /* failed during ring init, it couldn't have issued any requests */
- if (!ctx->rings)
- return;
-
- while (1) {
- enum io_wq_cancel cret;
- bool ret = false;
-
- if (!task) {
- ret |= io_uring_try_cancel_iowq(ctx);
- } else if (tctx && tctx->io_wq) {
- /*
- * Cancels requests of all rings, not only @ctx, but
- * it's fine as the task is in exit/exec.
- */
- cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
- &cancel, true);
- ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
- }
-
- /* SQPOLL thread does its own polling */
- if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
- (ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!wq_list_empty(&ctx->iopoll_list)) {
- io_iopoll_try_reap_events(ctx);
- ret = true;
- }
- }
-
- ret |= io_cancel_defer_files(ctx, task, cancel_all);
- ret |= io_poll_remove_all(ctx, task, cancel_all);
- ret |= io_kill_timeouts(ctx, task, cancel_all);
- if (task)
- ret |= io_run_task_work();
- if (!ret)
- break;
- cond_resched();
- }
-}
-
-static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
- int ret;
-
- if (unlikely(!tctx)) {
- ret = io_uring_alloc_task_context(current, ctx);
- if (unlikely(ret))
- return ret;
-
- tctx = current->io_uring;
- if (ctx->iowq_limits_set) {
- unsigned int limits[2] = { ctx->iowq_limits[0],
- ctx->iowq_limits[1], };
-
- ret = io_wq_max_workers(tctx->io_wq, limits);
- if (ret)
- return ret;
- }
- }
- if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
- node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
- return -ENOMEM;
- node->ctx = ctx;
- node->task = current;
-
- ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
- node, GFP_KERNEL));
- if (ret) {
- kfree(node);
- return ret;
- }
-
- mutex_lock(&ctx->uring_lock);
- list_add(&node->ctx_node, &ctx->tctx_list);
- mutex_unlock(&ctx->uring_lock);
- }
- tctx->last = ctx;
- return 0;
-}
-
-/*
- * Note that this task has used io_uring. We use it for cancelation purposes.
- */
-static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- if (likely(tctx && tctx->last == ctx))
- return 0;
- return __io_uring_add_tctx_node(ctx);
-}
-
-/*
- * Remove this io_uring_file -> task mapping.
- */
-static __cold void io_uring_del_tctx_node(unsigned long index)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
-
- if (!tctx)
- return;
- node = xa_erase(&tctx->xa, index);
- if (!node)
- return;
-
- WARN_ON_ONCE(current != node->task);
- WARN_ON_ONCE(list_empty(&node->ctx_node));
-
- mutex_lock(&node->ctx->uring_lock);
- list_del(&node->ctx_node);
- mutex_unlock(&node->ctx->uring_lock);
-
- if (tctx->last == node->ctx)
- tctx->last = NULL;
- kfree(node);
-}
-
-static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
-{
- struct io_wq *wq = tctx->io_wq;
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- io_uring_del_tctx_node(index);
- cond_resched();
- }
- if (wq) {
- /*
- * Must be after io_uring_del_tctx_node() (removes nodes under
- * uring_lock) to avoid race with io_uring_try_cancel_iowq().
- */
- io_wq_put_and_exit(wq);
- tctx->io_wq = NULL;
- }
-}
-
-static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
-{
- if (tracked)
- return atomic_read(&tctx->inflight_tracked);
- return percpu_counter_sum(&tctx->inflight);
-}
-
-/*
- * Find any io_uring ctx that this task has registered or done IO on, and cancel
- * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
- */
-static __cold void io_uring_cancel_generic(bool cancel_all,
- struct io_sq_data *sqd)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_ring_ctx *ctx;
- s64 inflight;
- DEFINE_WAIT(wait);
-
- WARN_ON_ONCE(sqd && sqd->thread != current);
-
- if (!current->io_uring)
- return;
- if (tctx->io_wq)
- io_wq_exit_start(tctx->io_wq);
-
- atomic_inc(&tctx->in_idle);
- do {
- io_uring_drop_tctx_refs(current);
- /* read completions before cancelations */
- inflight = tctx_inflight(tctx, !cancel_all);
- if (!inflight)
- break;
-
- if (!sqd) {
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- /* sqpoll task will cancel all its requests */
- if (node->ctx->sq_data)
- continue;
- io_uring_try_cancel_requests(node->ctx, current,
- cancel_all);
- }
- } else {
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_try_cancel_requests(ctx, current,
- cancel_all);
- }
-
- prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
- io_run_task_work();
- io_uring_drop_tctx_refs(current);
-
- /*
- * If we've seen completions, retry without waiting. This
- * avoids a race where a completion comes in before we did
- * prepare_to_wait().
- */
- if (inflight == tctx_inflight(tctx, !cancel_all))
- schedule();
- finish_wait(&tctx->wait, &wait);
- } while (1);
-
- io_uring_clean_tctx(tctx);
- if (cancel_all) {
- /*
- * We shouldn't run task_works after cancel, so just leave
- * ->in_idle set for normal exit.
- */
- atomic_dec(&tctx->in_idle);
- /* for exec all current's requests should be gone, kill tctx */
- __io_uring_free(current);
- }
-}
-
-void __io_uring_cancel(bool cancel_all)
-{
- io_uring_cancel_generic(cancel_all, NULL);
-}
-
-void io_uring_unreg_ringfd(void)
-{
- struct io_uring_task *tctx = current->io_uring;
- int i;
-
- for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
- if (tctx->registered_rings[i]) {
- fput(tctx->registered_rings[i]);
- tctx->registered_rings[i] = NULL;
- }
- }
-}
-
-static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
- int start, int end)
-{
- struct file *file;
- int offset;
-
- for (offset = start; offset < end; offset++) {
- offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
- if (tctx->registered_rings[offset])
- continue;
-
- file = fget(fd);
- if (!file) {
- return -EBADF;
- } else if (file->f_op != &io_uring_fops) {
- fput(file);
- return -EOPNOTSUPP;
- }
- tctx->registered_rings[offset] = file;
- return offset;
- }
-
- return -EBUSY;
-}
-
-/*
- * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
- * invocation. User passes in an array of struct io_uring_rsrc_update
- * with ->data set to the ring_fd, and ->offset given for the desired
- * index. If no index is desired, application may set ->offset == -1U
- * and we'll find an available index. Returns number of entries
- * successfully processed, or < 0 on error if none were processed.
- */
-static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update __user *arg = __arg;
- struct io_uring_rsrc_update reg;
- struct io_uring_task *tctx;
- int ret, i;
-
- if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
- return -EINVAL;
-
- mutex_unlock(&ctx->uring_lock);
- ret = io_uring_add_tctx_node(ctx);
- mutex_lock(&ctx->uring_lock);
- if (ret)
- return ret;
-
- tctx = current->io_uring;
- for (i = 0; i < nr_args; i++) {
- int start, end;
-
- if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
- ret = -EFAULT;
- break;
- }
-
- if (reg.resv) {
- ret = -EINVAL;
- break;
- }
-
- if (reg.offset == -1U) {
- start = 0;
- end = IO_RINGFD_REG_MAX;
- } else {
- if (reg.offset >= IO_RINGFD_REG_MAX) {
- ret = -EINVAL;
- break;
- }
- start = reg.offset;
- end = start + 1;
- }
-
- ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
- if (ret < 0)
- break;
-
- reg.offset = ret;
- if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
- fput(tctx->registered_rings[reg.offset]);
- tctx->registered_rings[reg.offset] = NULL;
- ret = -EFAULT;
- break;
- }
- }
-
- return i ? i : ret;
-}
-
-static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update __user *arg = __arg;
- struct io_uring_task *tctx = current->io_uring;
- struct io_uring_rsrc_update reg;
- int ret = 0, i;
-
- if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
- return -EINVAL;
- if (!tctx)
- return 0;
-
- for (i = 0; i < nr_args; i++) {
- if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
- ret = -EFAULT;
- break;
- }
- if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
- ret = -EINVAL;
- break;
- }
-
- reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
- if (tctx->registered_rings[reg.offset]) {
- fput(tctx->registered_rings[reg.offset]);
- tctx->registered_rings[reg.offset] = NULL;
- }
- }
-
- return i ? i : ret;
-}
-
-static void *io_uring_validate_mmap_request(struct file *file,
- loff_t pgoff, size_t sz)
-{
- struct io_ring_ctx *ctx = file->private_data;
- loff_t offset = pgoff << PAGE_SHIFT;
- struct page *page;
- void *ptr;
-
- switch (offset) {
- case IORING_OFF_SQ_RING:
- case IORING_OFF_CQ_RING:
- ptr = ctx->rings;
- break;
- case IORING_OFF_SQES:
- ptr = ctx->sq_sqes;
- break;
- default:
- return ERR_PTR(-EINVAL);
- }
-
- page = virt_to_head_page(ptr);
- if (sz > page_size(page))
- return ERR_PTR(-EINVAL);
-
- return ptr;
-}
-
-#ifdef CONFIG_MMU
-
-static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- size_t sz = vma->vm_end - vma->vm_start;
- unsigned long pfn;
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
-}
-
-#else /* !CONFIG_MMU */
-
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
-}
-
-static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
-{
- return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
-}
-
-static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, pgoff, len);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- return (unsigned long) ptr;
-}
-
-#endif /* !CONFIG_MMU */
-
-static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
-{
- DEFINE_WAIT(wait);
-
- do {
- if (!io_sqring_full(ctx))
- break;
- prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
-
- if (!io_sqring_full(ctx))
- break;
- schedule();
- } while (!signal_pending(current));
-
- finish_wait(&ctx->sqo_sq_wait, &wait);
- return 0;
-}
-
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
-{
- if (flags & IORING_ENTER_EXT_ARG) {
- struct io_uring_getevents_arg arg;
-
- if (argsz != sizeof(arg))
- return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
- return -EFAULT;
- }
- return 0;
-}
-
-static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
- struct __kernel_timespec __user **ts,
- const sigset_t __user **sig)
-{
- struct io_uring_getevents_arg arg;
-
- /*
- * If EXT_ARG isn't set, then we have no timespec and the argp pointer
- * is just a pointer to the sigset_t.
- */
- if (!(flags & IORING_ENTER_EXT_ARG)) {
- *sig = (const sigset_t __user *) argp;
- *ts = NULL;
- return 0;
- }
-
- /*
- * EXT_ARG is set - ensure we agree on the size of it and copy in our
- * timespec and sigset_t pointers if good.
- */
- if (*argsz != sizeof(arg))
- return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
- return -EFAULT;
- if (arg.pad)
- return -EINVAL;
- *sig = u64_to_user_ptr(arg.sigmask);
- *argsz = arg.sigmask_sz;
- *ts = u64_to_user_ptr(arg.ts);
- return 0;
-}
-
-SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const void __user *, argp,
- size_t, argsz)
-{
- struct io_ring_ctx *ctx;
- struct fd f;
- long ret;
-
- io_run_task_work();
-
- if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
- IORING_ENTER_REGISTERED_RING)))
- return -EINVAL;
-
- /*
- * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
- * need only dereference our task private array to find it.
- */
- if (flags & IORING_ENTER_REGISTERED_RING) {
- struct io_uring_task *tctx = current->io_uring;
-
- if (!tctx || fd >= IO_RINGFD_REG_MAX)
- return -EINVAL;
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- f.file = tctx->registered_rings[fd];
- f.flags = 0;
- } else {
- f = fdget(fd);
- }
-
- if (unlikely(!f.file))
- return -EBADF;
-
- ret = -EOPNOTSUPP;
- if (unlikely(f.file->f_op != &io_uring_fops))
- goto out_fput;
-
- ret = -ENXIO;
- ctx = f.file->private_data;
- if (unlikely(!percpu_ref_tryget(&ctx->refs)))
- goto out_fput;
-
- ret = -EBADFD;
- if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
- goto out;
-
- /*
- * For SQ polling, the thread will do all submissions and completions.
- * Just return the requested submit count, and wake the thread if
- * we were asked to.
- */
- ret = 0;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- io_cqring_overflow_flush(ctx);
-
- if (unlikely(ctx->sq_data->thread == NULL)) {
- ret = -EOWNERDEAD;
- goto out;
- }
- if (flags & IORING_ENTER_SQ_WAKEUP)
- wake_up(&ctx->sq_data->wait);
- if (flags & IORING_ENTER_SQ_WAIT) {
- ret = io_sqpoll_wait_sq(ctx);
- if (ret)
- goto out;
- }
- ret = to_submit;
- } else if (to_submit) {
- ret = io_uring_add_tctx_node(ctx);
- if (unlikely(ret))
- goto out;
-
- mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit);
- if (ret != to_submit) {
- mutex_unlock(&ctx->uring_lock);
- goto out;
- }
- if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
- goto iopoll_locked;
- mutex_unlock(&ctx->uring_lock);
- }
- if (flags & IORING_ENTER_GETEVENTS) {
- int ret2;
- if (ctx->syscall_iopoll) {
- /*
- * We disallow the app entering submit/complete with
- * polling, but we still need to lock the ring to
- * prevent racing with polled issue that got punted to
- * a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
-iopoll_locked:
- ret2 = io_validate_ext_arg(flags, argp, argsz);
- if (likely(!ret2)) {
- min_complete = min(min_complete,
- ctx->cq_entries);
- ret2 = io_iopoll_check(ctx, min_complete);
- }
- mutex_unlock(&ctx->uring_lock);
- } else {
- const sigset_t __user *sig;
- struct __kernel_timespec __user *ts;
-
- ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
- if (likely(!ret2)) {
- min_complete = min(min_complete,
- ctx->cq_entries);
- ret2 = io_cqring_wait(ctx, min_complete, sig,
- argsz, ts);
- }
- }
-
- if (!ret) {
- ret = ret2;
-
- /*
- * EBADR indicates that one or more CQE were dropped.
- * Once the user has been informed we can clear the bit
- * as they are obviously ok with those drops.
- */
- if (unlikely(ret2 == -EBADR))
- clear_bit(IO_CHECK_CQ_DROPPED_BIT,
- &ctx->check_cq);
- }
- }
-
-out:
- percpu_ref_put(&ctx->refs);
-out_fput:
- fdput(f);
- return ret;
-}
-
-#ifdef CONFIG_PROC_FS
-static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
- const struct cred *cred)
-{
- struct user_namespace *uns = seq_user_ns(m);
- struct group_info *gi;
- kernel_cap_t cap;
- unsigned __capi;
- int g;
-
- seq_printf(m, "%5d\n", id);
- seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
- seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
- seq_puts(m, "\n\tGroups:\t");
- gi = cred->group_info;
- for (g = 0; g < gi->ngroups; g++) {
- seq_put_decimal_ull(m, g ? " " : "",
- from_kgid_munged(uns, gi->gid[g]));
- }
- seq_puts(m, "\n\tCapEff:\t");
- cap = cred->cap_effective;
- CAP_FOR_EACH_U32(__capi)
- seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
- seq_putc(m, '\n');
- return 0;
-}
-
-static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
- struct seq_file *m)
-{
- struct io_sq_data *sq = NULL;
- struct io_overflow_cqe *ocqe;
- struct io_rings *r = ctx->rings;
- unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
- unsigned int sq_head = READ_ONCE(r->sq.head);
- unsigned int sq_tail = READ_ONCE(r->sq.tail);
- unsigned int cq_head = READ_ONCE(r->cq.head);
- unsigned int cq_tail = READ_ONCE(r->cq.tail);
- unsigned int cq_shift = 0;
- unsigned int sq_entries, cq_entries;
- bool has_lock;
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
- unsigned int i;
-
- if (is_cqe32)
- cq_shift = 1;
-
- /*
- * we may get imprecise sqe and cqe info if uring is actively running
- * since we get cached_sq_head and cached_cq_tail without uring_lock
- * and sq_tail and cq_head are changed by userspace. But it's ok since
- * we usually use these info when it is stuck.
- */
- seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
- seq_printf(m, "SqHead:\t%u\n", sq_head);
- seq_printf(m, "SqTail:\t%u\n", sq_tail);
- seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
- seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
- seq_printf(m, "CqHead:\t%u\n", cq_head);
- seq_printf(m, "CqTail:\t%u\n", cq_tail);
- seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
- seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
- sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
- for (i = 0; i < sq_entries; i++) {
- unsigned int entry = i + sq_head;
- unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
- struct io_uring_sqe *sqe;
-
- if (sq_idx > sq_mask)
- continue;
- sqe = &ctx->sq_sqes[sq_idx];
- seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
- sq_idx, sqe->opcode, sqe->fd, sqe->flags,
- sqe->user_data);
- }
- seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
- cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
- for (i = 0; i < cq_entries; i++) {
- unsigned int entry = i + cq_head;
- struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
-
- if (!is_cqe32) {
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
- entry & cq_mask, cqe->user_data, cqe->res,
- cqe->flags);
- } else {
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
- "extra1:%llu, extra2:%llu\n",
- entry & cq_mask, cqe->user_data, cqe->res,
- cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
- }
- }
-
- /*
- * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
- * since fdinfo case grabs it in the opposite direction of normal use
- * cases. If we fail to get the lock, we just don't iterate any
- * structures that could be going away outside the io_uring mutex.
- */
- has_lock = mutex_trylock(&ctx->uring_lock);
-
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
- sq = ctx->sq_data;
- if (!sq->thread)
- sq = NULL;
- }
-
- seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
- seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
- seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
- for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct file *f = io_file_from_index(ctx, i);
-
- if (f)
- seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
- else
- seq_printf(m, "%5u: <none>\n", i);
- }
- seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
- for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *buf = ctx->user_bufs[i];
- unsigned int len = buf->ubuf_end - buf->ubuf;
-
- seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
- }
- if (has_lock && !xa_empty(&ctx->personalities)) {
- unsigned long index;
- const struct cred *cred;
-
- seq_printf(m, "Personalities:\n");
- xa_for_each(&ctx->personalities, index, cred)
- io_uring_show_cred(m, index, cred);
- }
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
-
- seq_puts(m, "PollList:\n");
- spin_lock(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list = &ctx->cancel_hash[i];
- struct io_kiocb *req;
-
- hlist_for_each_entry(req, list, hash_node)
- seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- task_work_pending(req->task));
- }
-
- seq_puts(m, "CqOverflowList:\n");
- list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
- struct io_uring_cqe *cqe = &ocqe->cqe;
-
- seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
- cqe->user_data, cqe->res, cqe->flags);
-
- }
-
- spin_unlock(&ctx->completion_lock);
-}
-
-static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct io_ring_ctx *ctx = f->private_data;
-
- if (percpu_ref_tryget(&ctx->refs)) {
- __io_uring_show_fdinfo(ctx, m);
- percpu_ref_put(&ctx->refs);
- }
-}
-#endif
-
-static const struct file_operations io_uring_fops = {
- .release = io_uring_release,
- .mmap = io_uring_mmap,
-#ifndef CONFIG_MMU
- .get_unmapped_area = io_uring_nommu_get_unmapped_area,
- .mmap_capabilities = io_uring_nommu_mmap_capabilities,
-#endif
- .poll = io_uring_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = io_uring_show_fdinfo,
-#endif
-};
-
-static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
-{
- struct io_rings *rings;
- size_t size, sq_array_offset;
-
- /* make sure these are sane, as we already accounted them */
- ctx->sq_entries = p->sq_entries;
- ctx->cq_entries = p->cq_entries;
-
- size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- rings = io_mem_alloc(size);
- if (!rings)
- return -ENOMEM;
-
- ctx->rings = rings;
- ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
- rings->sq_ring_mask = p->sq_entries - 1;
- rings->cq_ring_mask = p->cq_entries - 1;
- rings->sq_ring_entries = p->sq_entries;
- rings->cq_ring_entries = p->cq_entries;
-
- if (p->flags & IORING_SETUP_SQE128)
- size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
- else
- size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
- if (size == SIZE_MAX) {
- io_mem_free(ctx->rings);
- ctx->rings = NULL;
- return -EOVERFLOW;
- }
-
- ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes) {
- io_mem_free(ctx->rings);
- ctx->rings = NULL;
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
-{
- int ret, fd;
-
- fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- ret = io_uring_add_tctx_node(ctx);
- if (ret) {
- put_unused_fd(fd);
- return ret;
- }
- fd_install(fd, file);
- return fd;
-}
-
-/*
- * Allocate an anonymous fd, this is what constitutes the application
- * visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
- */
-static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
-{
- struct file *file;
-#if defined(CONFIG_UNIX)
- int ret;
-
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ERR_PTR(ret);
-#endif
-
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
- O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
- if (IS_ERR(file)) {
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
- } else {
- ctx->ring_sock->file = file;
- }
-#endif
- return file;
-}
-
-static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
-{
- struct io_ring_ctx *ctx;
- struct file *file;
- int ret;
-
- if (!entries)
- return -EINVAL;
- if (entries > IORING_MAX_ENTRIES) {
- if (!(p->flags & IORING_SETUP_CLAMP))
- return -EINVAL;
- entries = IORING_MAX_ENTRIES;
- }
-
- /*
- * Use twice as many entries for the CQ ring. It's possible for the
- * application to drive a higher depth than the size of the SQ ring,
- * since the sqes are only used at submission time. This allows for
- * some flexibility in overcommitting a bit. If the application has
- * set IORING_SETUP_CQSIZE, it will have passed in the desired number
- * of CQ ring entries manually.
- */
- p->sq_entries = roundup_pow_of_two(entries);
- if (p->flags & IORING_SETUP_CQSIZE) {
- /*
- * If IORING_SETUP_CQSIZE is set, we do the same roundup
- * to a power-of-two, if it isn't already. We do NOT impose
- * any cq vs sq ring sizing.
- */
- if (!p->cq_entries)
- return -EINVAL;
- if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
- if (!(p->flags & IORING_SETUP_CLAMP))
- return -EINVAL;
- p->cq_entries = IORING_MAX_CQ_ENTRIES;
- }
- p->cq_entries = roundup_pow_of_two(p->cq_entries);
- if (p->cq_entries < p->sq_entries)
- return -EINVAL;
- } else {
- p->cq_entries = 2 * p->sq_entries;
- }
-
- ctx = io_ring_ctx_alloc(p);
- if (!ctx)
- return -ENOMEM;
-
- /*
- * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
- * space applications don't need to do io completion events
- * polling again, they can rely on io_sq_thread to do polling
- * work, which can reduce cpu usage and uring_lock contention.
- */
- if (ctx->flags & IORING_SETUP_IOPOLL &&
- !(ctx->flags & IORING_SETUP_SQPOLL))
- ctx->syscall_iopoll = 1;
-
- ctx->compat = in_compat_syscall();
- if (!capable(CAP_IPC_LOCK))
- ctx->user = get_uid(current_user());
-
- /*
- * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
- * COOP_TASKRUN is set, then IPIs are never needed by the app.
- */
- ret = -EINVAL;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- /* IPI related flags don't make sense with SQPOLL */
- if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
- IORING_SETUP_TASKRUN_FLAG))
- goto err;
- ctx->notify_method = TWA_SIGNAL_NO_IPI;
- } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
- ctx->notify_method = TWA_SIGNAL_NO_IPI;
- } else {
- if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
- goto err;
- ctx->notify_method = TWA_SIGNAL;
- }
-
- /*
- * This is just grabbed for accounting purposes. When a process exits,
- * the mm is exited and dropped before the files, hence we need to hang
- * on to this mm purely for the purposes of being able to unaccount
- * memory (locked/pinned vm). It's not used for anything else.
- */
- mmgrab(current->mm);
- ctx->mm_account = current->mm;
-
- ret = io_allocate_scq_urings(ctx, p);
- if (ret)
- goto err;
-
- ret = io_sq_offload_create(ctx, p);
- if (ret)
- goto err;
- /* always set a rsrc node */
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto err;
- io_rsrc_node_switch(ctx, NULL);
-
- memset(&p->sq_off, 0, sizeof(p->sq_off));
- p->sq_off.head = offsetof(struct io_rings, sq.head);
- p->sq_off.tail = offsetof(struct io_rings, sq.tail);
- p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
- p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
- p->sq_off.flags = offsetof(struct io_rings, sq_flags);
- p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
- p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
-
- memset(&p->cq_off, 0, sizeof(p->cq_off));
- p->cq_off.head = offsetof(struct io_rings, cq.head);
- p->cq_off.tail = offsetof(struct io_rings, cq.tail);
- p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
- p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
- p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
- p->cq_off.cqes = offsetof(struct io_rings, cqes);
- p->cq_off.flags = offsetof(struct io_rings, cq_flags);
-
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
- IORING_FEAT_LINKED_FILE;
-
- if (copy_to_user(params, p, sizeof(*p))) {
- ret = -EFAULT;
- goto err;
- }
-
- file = io_uring_get_file(ctx);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto err;
- }
-
- /*
- * Install ring fd as the very last thing, so we don't risk someone
- * having closed it before we finish setup
- */
- ret = io_uring_install_fd(ctx, file);
- if (ret < 0) {
- /* fput will clean it up */
- fput(file);
- return ret;
- }
-
- trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
- return ret;
-err:
- io_ring_ctx_wait_and_kill(ctx);
- return ret;
-}
-
-/*
- * Sets up an aio uring context, and returns the fd. Applications asks for a
- * ring size, we return the actual sq/cq ring sizes (among other things) in the
- * params structure passed in.
- */
-static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
-{
- struct io_uring_params p;
- int i;
-
- if (copy_from_user(&p, params, sizeof(p)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
- if (p.resv[i])
- return -EINVAL;
- }
-
- if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
- IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
- IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
- return -EINVAL;
-
- return io_uring_create(entries, &p, params);
-}
-
-SYSCALL_DEFINE2(io_uring_setup, u32, entries,
- struct io_uring_params __user *, params)
-{
- return io_uring_setup(entries, params);
-}
-
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_op_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
-
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
- struct io_uring_rsrc_update2 *up,
- unsigned nr_args)
-{
- __u32 tmp;
- int err;
-
- if (check_add_overflow(up->offset, nr_args, &tmp))
- return -EOVERFLOW;
- err = io_rsrc_node_switch_start(ctx);
- if (err)
- return err;
-
- switch (type) {
- case IORING_RSRC_FILE:
- return __io_sqe_files_update(ctx, up, nr_args);
- case IORING_RSRC_BUFFER:
- return __io_sqe_buffers_update(ctx, up, nr_args);
- }
- return -EINVAL;
-}
-
-static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update2 up;
-
- if (!nr_args)
- return -EINVAL;
- memset(&up, 0, sizeof(up));
- if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
- return -EFAULT;
- if (up.resv || up.resv2)
- return -EINVAL;
- return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
-}
-
-static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned size, unsigned type)
-{
- struct io_uring_rsrc_update2 up;
-
- if (size != sizeof(up))
- return -EINVAL;
- if (copy_from_user(&up, arg, sizeof(up)))
- return -EFAULT;
- if (!up.nr || up.resv || up.resv2)
- return -EINVAL;
- return __io_register_rsrc_update(ctx, type, &up, up.nr);
-}
-
-static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int size, unsigned int type)
-{
- struct io_uring_rsrc_register rr;
-
- /* keep it extendible */
- if (size != sizeof(rr))
- return -EINVAL;
-
- memset(&rr, 0, sizeof(rr));
- if (copy_from_user(&rr, arg, size))
- return -EFAULT;
- if (!rr.nr || rr.resv2)
- return -EINVAL;
- if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
- return -EINVAL;
-
- switch (type) {
- case IORING_RSRC_FILE:
- if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
- break;
- return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
- rr.nr, u64_to_user_ptr(rr.tags));
- case IORING_RSRC_BUFFER:
- if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
- break;
- return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
- rr.nr, u64_to_user_ptr(rr.tags));
- }
- return -EINVAL;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- struct io_uring_task *tctx = current->io_uring;
- cpumask_var_t new_mask;
- int ret;
-
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (in_compat_syscall()) {
- ret = compat_get_bitmap(cpumask_bits(new_mask),
- (const compat_ulong_t __user *)arg,
- len * 8 /* CHAR_BIT */);
- } else {
- ret = copy_from_user(new_mask, arg, len);
- }
-
- if (ret) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
- free_cpumask_var(new_mask);
- return ret;
-}
-
-static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
-
- return io_wq_cpu_affinity(tctx->io_wq, NULL);
-}
-
-static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
- __must_hold(&ctx->uring_lock)
-{
- struct io_tctx_node *node;
- struct io_uring_task *tctx = NULL;
- struct io_sq_data *sqd = NULL;
- __u32 new_count[2];
- int i, ret;
-
- if (copy_from_user(new_count, arg, sizeof(new_count)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i] > INT_MAX)
- return -EINVAL;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- sqd = ctx->sq_data;
- if (sqd) {
- /*
- * Observe the correct sqd->lock -> ctx->uring_lock
- * ordering. Fine to drop uring_lock here, we hold
- * a ref to the ctx.
- */
- refcount_inc(&sqd->refs);
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&sqd->lock);
- mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
- }
- } else {
- tctx = current->io_uring;
- }
-
- BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i])
- ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
-
- if (tctx && tctx->io_wq) {
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
- } else {
- memset(new_count, 0, sizeof(new_count));
- }
-
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
-
- if (copy_to_user(arg, new_count, sizeof(new_count)))
- return -EFAULT;
-
- /* that's it for SQPOLL, only the SQPOLL task creates requests */
- if (sqd)
- return 0;
-
- /* now propagate the restriction to all registered users */
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- if (WARN_ON_ONCE(!tctx->io_wq))
- continue;
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- new_count[i] = ctx->iowq_limits[i];
- /* ignore errors, it always returns zero anyway */
- (void)io_wq_max_workers(tctx->io_wq, new_count);
- }
- return 0;
-err:
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
- return ret;
-}
-
-static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
-{
- struct io_uring_buf_ring *br;
- struct io_uring_buf_reg reg;
- struct io_buffer_list *bl;
- struct page **pages;
- int nr_pages;
-
- if (copy_from_user(&reg, arg, sizeof(reg)))
- return -EFAULT;
-
- if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
- return -EINVAL;
- if (!reg.ring_addr)
- return -EFAULT;
- if (reg.ring_addr & ~PAGE_MASK)
- return -EINVAL;
- if (!is_power_of_2(reg.ring_entries))
- return -EINVAL;
-
- /* cannot disambiguate full vs empty due to head/tail size */
- if (reg.ring_entries >= 65536)
- return -EINVAL;
-
- if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
- int ret = io_init_bl_list(ctx);
- if (ret)
- return ret;
- }
-
- bl = io_buffer_get_list(ctx, reg.bgid);
- if (bl) {
- /* if mapped buffer ring OR classic exists, don't allow */
- if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
- return -EEXIST;
- } else {
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
- }
-
- pages = io_pin_pages(reg.ring_addr,
- struct_size(br, bufs, reg.ring_entries),
- &nr_pages);
- if (IS_ERR(pages)) {
- kfree(bl);
- return PTR_ERR(pages);
- }
-
- br = page_address(pages[0]);
- bl->buf_pages = pages;
- bl->buf_nr_pages = nr_pages;
- bl->nr_entries = reg.ring_entries;
- bl->buf_ring = br;
- bl->mask = reg.ring_entries - 1;
- io_buffer_add_list(ctx, bl, reg.bgid);
- return 0;
-}
-
-static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
-{
- struct io_uring_buf_reg reg;
- struct io_buffer_list *bl;
-
- if (copy_from_user(&reg, arg, sizeof(reg)))
- return -EFAULT;
- if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
- return -EINVAL;
-
- bl = io_buffer_get_list(ctx, reg.bgid);
- if (!bl)
- return -ENOENT;
- if (!bl->buf_nr_pages)
- return -EINVAL;
-
- __io_remove_buffers(ctx, bl, -1U);
- if (bl->bgid >= BGID_ARRAY) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
- kfree(bl);
- }
- return 0;
-}
-
-static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
- void __user *arg, unsigned nr_args)
- __releases(ctx->uring_lock)
- __acquires(ctx->uring_lock)
-{
- int ret;
-
- /*
- * We're inside the ring mutex, if the ref is already dying, then
- * someone else killed the ctx or is already going through
- * io_uring_register().
- */
- if (percpu_ref_is_dying(&ctx->refs))
- return -ENXIO;
-
- if (ctx->restricted) {
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
- opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
- if (!test_bit(opcode, ctx->restrictions.register_op))
- return -EACCES;
- }
-
- switch (opcode) {
- case IORING_REGISTER_BUFFERS:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_BUFFERS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_buffers_unregister(ctx);
- break;
- case IORING_REGISTER_FILES:
- ret = -EFAULT;
- if (!arg)
- break;
- ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_FILES:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_files_unregister(ctx);
- break;
- case IORING_REGISTER_FILES_UPDATE:
- ret = io_register_files_update(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_EVENTFD:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 0);
- break;
- case IORING_REGISTER_EVENTFD_ASYNC:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 1);
- break;
- case IORING_UNREGISTER_EVENTFD:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_eventfd_unregister(ctx);
- break;
- case IORING_REGISTER_PROBE:
- ret = -EINVAL;
- if (!arg || nr_args > 256)
- break;
- ret = io_probe(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_personality(ctx);
- break;
- case IORING_UNREGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg)
- break;
- ret = io_unregister_personality(ctx, nr_args);
- break;
- case IORING_REGISTER_ENABLE_RINGS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_enable_rings(ctx);
- break;
- case IORING_REGISTER_RESTRICTIONS:
- ret = io_register_restrictions(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_FILES2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_FILES_UPDATE2:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_BUFFERS2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_BUFFERS_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (!arg || !nr_args)
- break;
- ret = io_register_iowq_aff(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_unregister_iowq_aff(ctx);
- break;
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- ret = -EINVAL;
- if (!arg || nr_args != 2)
- break;
- ret = io_register_iowq_max_workers(ctx, arg);
- break;
- case IORING_REGISTER_RING_FDS:
- ret = io_ringfd_register(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_RING_FDS:
- ret = io_ringfd_unregister(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_register_pbuf_ring(ctx, arg);
- break;
- case IORING_UNREGISTER_PBUF_RING:
- ret = -EINVAL;
- if (!arg || nr_args != 1)
- break;
- ret = io_unregister_pbuf_ring(ctx, arg);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
- void __user *, arg, unsigned int, nr_args)
-{
- struct io_ring_ctx *ctx;
- long ret = -EBADF;
- struct fd f;
-
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
-
- ret = -EOPNOTSUPP;
- if (f.file->f_op != &io_uring_fops)
- goto out_fput;
-
- ctx = f.file->private_data;
-
- io_run_task_work();
-
- mutex_lock(&ctx->uring_lock);
- ret = __io_uring_register(ctx, opcode, arg, nr_args);
- mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
- fdput(f);
- return ret;
-}
-
-static int __init io_uring_init(void)
-{
-#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
- BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
- BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
-} while (0)
-
-#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
- __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
- BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
- BUILD_BUG_SQE_ELEM(0, __u8, opcode);
- BUILD_BUG_SQE_ELEM(1, __u8, flags);
- BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
- BUILD_BUG_SQE_ELEM(4, __s32, fd);
- BUILD_BUG_SQE_ELEM(8, __u64, off);
- BUILD_BUG_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_SQE_ELEM(24, __u32, len);
- BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
- BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_SQE_ELEM(44, __u32, file_index);
- BUILD_BUG_SQE_ELEM(48, __u64, addr3);
-
- BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
- sizeof(struct io_uring_rsrc_update));
- BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
- sizeof(struct io_uring_rsrc_update2));
-
- /* ->buf_index is u16 */
- BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
- BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
- BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
- BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
- offsetof(struct io_uring_buf_ring, tail));
-
- /* should fit into one byte */
- BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
- BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
- BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
-
- BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
- BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
-
- BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
-
- BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
-
- req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
- SLAB_ACCOUNT);
- return 0;
-};
-__initcall(io_uring_init);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d2a9f699e17e..6505d45a99e0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset;
static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio)
+iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
{
struct iomap_page *iop = to_iomap_page(folio);
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ gfp_t gfp;
if (iop || nr_blocks <= 1)
return iop;
+ if (flags & IOMAP_NOWAIT)
+ gfp = GFP_NOWAIT;
+ else
+ gfp = GFP_NOFS | __GFP_NOFAIL;
+
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
- GFP_NOFS | __GFP_NOFAIL);
- spin_lock_init(&iop->uptodate_lock);
- if (folio_test_uptodate(folio))
- bitmap_fill(iop->uptodate, nr_blocks);
- folio_attach_private(folio, iop);
+ gfp);
+ if (iop) {
+ spin_lock_init(&iop->uptodate_lock);
+ if (folio_test_uptodate(folio))
+ bitmap_fill(iop->uptodate, nr_blocks);
+ folio_attach_private(folio, iop);
+ }
return iop;
}
@@ -154,9 +162,6 @@ static void iomap_iop_set_range_uptodate(struct folio *folio,
static void iomap_set_range_uptodate(struct folio *folio,
struct iomap_page *iop, size_t off, size_t len)
{
- if (folio_test_error(folio))
- return;
-
if (iop)
iomap_iop_set_range_uptodate(folio, iop, off, len);
else
@@ -226,7 +231,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
- iop = iomap_page_create(iter->inode, folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
else
iop = to_iomap_page(folio);
@@ -264,7 +269,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(iter->inode, folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
@@ -492,31 +497,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-#ifdef CONFIG_MIGRATION
-int
-iomap_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- struct folio *folio = page_folio(page);
- struct folio *newfolio = page_folio(newpage);
- int ret;
-
- ret = folio_migrate_mapping(mapping, newfolio, folio, 0);
- if (ret != MIGRATEPAGE_SUCCESS)
- return ret;
-
- if (folio_test_private(folio))
- folio_attach_private(newfolio, folio_detach_private(folio));
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(newfolio, folio);
- else
- folio_migrate_flags(newfolio, folio);
- return MIGRATEPAGE_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(iomap_migrate_page);
-#endif /* CONFIG_MIGRATION */
-
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -547,10 +527,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop = iomap_page_create(iter->inode, folio);
+ struct iomap_page *iop;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
+ unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
@@ -558,6 +539,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
folio_clear_error(folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
+ if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
+ return -EAGAIN;
+
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
@@ -574,7 +559,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return -EIO;
folio_zero_segments(folio, poff, from, to, poff + plen);
} else {
- int status = iomap_read_folio_sync(block_start, folio,
+ int status;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap);
if (status)
return status;
@@ -603,6 +593,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
+ if (iter->flags & IOMAP_NOWAIT)
+ fgp |= FGP_NOWAIT;
+
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -622,7 +615,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
if (!folio) {
- status = -ENOMEM;
+ status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
if (pos + len > folio_pos(folio) + folio_size(folio))
@@ -740,6 +733,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t pos = iter->pos;
ssize_t written = 0;
long status = 0;
+ struct address_space *mapping = iter->inode->i_mapping;
+ unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
struct folio *folio;
@@ -752,6 +747,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
+ status = balance_dirty_pages_ratelimited_flags(mapping,
+ bdp_flags);
+ if (unlikely(status))
+ break;
+
if (bytes > length)
bytes = length;
@@ -760,6 +760,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
+ *
+ * For async buffered writes the assumption is that the user
+ * page has already been faulted in. This can be optimized by
+ * faulting the user page.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
@@ -771,7 +775,7 @@ again:
break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
- if (mapping_writably_mapped(iter->inode->i_mapping))
+ if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
@@ -796,10 +800,12 @@ again:
pos += status;
written += status;
length -= status;
-
- balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length);
+ if (status == -EAGAIN) {
+ iov_iter_revert(i, written);
+ return -EAGAIN;
+ }
return written ? written : status;
}
@@ -815,6 +821,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
};
int ret;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ iter.flags |= IOMAP_NOWAIT;
+
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
if (iter.pos == iocb->ki_pos)
@@ -1329,7 +1338,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos)
{
- struct iomap_page *iop = iomap_page_create(inode, folio);
+ struct iomap_page *iop = iomap_page_create(inode, folio, 0);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 370c3241618a..5d098adba443 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -242,7 +242,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct inode *inode = iter->inode;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
- unsigned int align = iov_iter_alignment(dio->submit.iter);
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
unsigned int bio_opf;
@@ -253,7 +252,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
- if ((pos | length | align) & ((1 << blkbits) - 1))
+ if ((pos | length) & ((1 << blkbits) - 1) ||
+ !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) {
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 1d732fd223d4..332dc9ac47a9 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (rc)
return rc;
- if (is_quota_modification(inode, iattr)) {
+ if (is_quota_modification(mnt_userns, inode, iattr)) {
rc = dquot_initialize(inode);
if (rc)
return rc;
}
if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
(iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
- rc = dquot_transfer(inode, iattr);
+ rc = dquot_transfer(mnt_userns, inode, iattr);
if (rc)
return rc;
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 259326556ada..d1ec920aa030 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -301,13 +301,25 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
{
int ret;
- ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, jfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block);
if (unlikely(ret))
jfs_write_failed(mapping, pos + len);
return ret;
}
+static int jfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied, struct page *page,
+ void *fsdata)
+{
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ret < len)
+ jfs_write_failed(mapping, pos + len);
+ return ret;
+}
+
static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping, block, jfs_get_block);
@@ -346,7 +358,7 @@ const struct address_space_operations jfs_aops = {
.writepage = jfs_writepage,
.writepages = jfs_writepages,
.write_begin = jfs_write_begin,
- .write_end = nobh_write_end,
+ .write_end = jfs_write_end,
.bmap = jfs_bmap,
.direct_IO = jfs_direct_IO,
};
@@ -399,7 +411,7 @@ void jfs_truncate(struct inode *ip)
{
jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size);
- nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block);
+ block_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block);
IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
jfs_truncate_nolock(ip, ip->i_size);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 387652ae14c2..2e8461ce74de 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -618,7 +618,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
SetPageUptodate(page);
} else {
page = read_mapping_page(mapping, page_index, NULL);
- if (IS_ERR(page) || !PageUptodate(page)) {
+ if (IS_ERR(page)) {
jfs_err("read_mapping_page failed!");
return NULL;
}
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index e6f4ccc12f49..94ab1dcd80e7 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -6490,6 +6490,7 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
+ ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
writethrough = true;
@@ -6505,10 +6506,6 @@ int smb2_write(struct ksmbd_work *work)
data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
le16_to_cpu(req->DataOffset));
- ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
- if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
- writethrough = true;
-
ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
fp->filp->f_path.dentry, offset, length);
err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
@@ -7703,7 +7700,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct file_zero_data_information *zero_data;
struct ksmbd_file *fp;
- loff_t off, len;
+ loff_t off, len, bfz;
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB,
@@ -7720,19 +7717,26 @@ int smb2_ioctl(struct ksmbd_work *work)
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
- fp = ksmbd_lookup_fd_fast(work, id);
- if (!fp) {
- ret = -ENOENT;
+ off = le64_to_cpu(zero_data->FileOffset);
+ bfz = le64_to_cpu(zero_data->BeyondFinalZero);
+ if (off > bfz) {
+ ret = -EINVAL;
goto out;
}
- off = le64_to_cpu(zero_data->FileOffset);
- len = le64_to_cpu(zero_data->BeyondFinalZero) - off;
+ len = bfz - off;
+ if (len) {
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp) {
+ ret = -ENOENT;
+ goto out;
+ }
- ret = ksmbd_vfs_zero_data(work, fp, off, len);
- ksmbd_fd_put(work, fp);
- if (ret < 0)
- goto out;
+ ret = ksmbd_vfs_zero_data(work, fp, off, len);
+ ksmbd_fd_put(work, fp);
+ if (ret < 0)
+ goto out;
+ }
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index d035e060c2f0..35b55ee94fe5 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -5,16 +5,6 @@
*
* Author(s): Long Li <longli@microsoft.com>,
* Hyunchul Lee <hyc.lee@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define SUBMOD_NAME "smb_direct"
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 8fef9de787d3..143bba4e4db8 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p)
break;
}
ret = kernel_accept(iface->ksmbd_socket, &client_sk,
- O_NONBLOCK);
+ SOCK_NONBLOCK);
mutex_unlock(&iface->sock_release_lock);
if (ret) {
if (ret == -EAGAIN)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index dcdd07c6efff..5d185564aef6 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -1015,7 +1015,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
off, len);
- return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len);
+ return vfs_fallocate(fp->filp,
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+ off, len);
}
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
@@ -1046,7 +1048,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
*out_count = 0;
end = start + length;
while (start < end && *out_count < in_count) {
- extent_start = f->f_op->llseek(f, start, SEEK_DATA);
+ extent_start = vfs_llseek(f, start, SEEK_DATA);
if (extent_start < 0) {
if (extent_start != -ENXIO)
ret = (int)extent_start;
@@ -1056,7 +1058,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
if (extent_start >= end)
break;
- extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE);
+ extent_end = vfs_llseek(f, extent_start, SEEK_HOLE);
if (extent_end < 0) {
if (extent_end != -ENXIO)
ret = (int)extent_end;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0d25f44f5707..8326ff8a7a96 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -75,26 +75,28 @@ static struct bio *mpage_bio_submit(struct bio *bio)
* them. So when the buffer is up to date and the page size == block size,
* this marks the page up to date instead of adding new buffers.
*/
-static void
-map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
+static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
+ int page_block)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct buffer_head *page_bh, *head;
int block = 0;
- if (!page_has_buffers(page)) {
+ head = folio_buffers(folio);
+ if (!head) {
/*
* don't make any buffers if there is only one buffer on
- * the page and the page just needs to be set up to date
+ * the folio and the folio just needs to be set up to date
*/
if (inode->i_blkbits == PAGE_SHIFT &&
buffer_uptodate(bh)) {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return;
}
- create_empty_buffers(page, i_blocksize(inode), 0);
+ create_empty_buffers(&folio->page, i_blocksize(inode), 0);
+ head = folio_buffers(folio);
}
- head = page_buffers(page);
+
page_bh = head;
do {
if (block == page_block) {
@@ -110,7 +112,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
struct mpage_readpage_args {
struct bio *bio;
- struct page *page;
+ struct folio *folio;
unsigned int nr_pages;
bool is_readahead;
sector_t last_block_in_bio;
@@ -130,8 +132,8 @@ struct mpage_readpage_args {
*/
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
- struct page *page = args->page;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = args->folio;
+ struct inode *inode = folio->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
@@ -148,17 +150,20 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
int op = REQ_OP_READ;
unsigned nblocks;
unsigned relative_block;
- gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
+
+ /* MAX_BUF_PER_PAGE, for example */
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
if (args->is_readahead) {
op |= REQ_RAHEAD;
gfp |= __GFP_NORETRY | __GFP_NOWARN;
}
- if (page_has_buffers(page))
+ if (folio_buffers(folio))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + args->nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -191,9 +196,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
}
/*
- * Then do more get_blocks calls until we are done with this page.
+ * Then do more get_blocks calls until we are done with this folio.
*/
- map_bh->b_page = page;
+ map_bh->b_page = &folio->page;
while (page_block < blocks_per_page) {
map_bh->b_state = 0;
map_bh->b_size = 0;
@@ -216,12 +221,12 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
/* some filesystems will copy data into the page during
* the get_block call, in which case we don't want to
- * read it again. map_buffer_to_page copies the data
- * we just collected from get_block into the page's buffers
- * so readpage doesn't have to repeat the get_block call
+ * read it again. map_buffer_to_folio copies the data
+ * we just collected from get_block into the folio's buffers
+ * so read_folio doesn't have to repeat the get_block call
*/
if (buffer_uptodate(map_bh)) {
- map_buffer_to_page(page, map_bh, page_block);
+ map_buffer_to_folio(folio, map_bh, page_block);
goto confused;
}
@@ -246,18 +251,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
}
if (first_hole != blocks_per_page) {
- zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
+ folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
if (first_hole == 0) {
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
goto out;
}
} else if (fully_mapped) {
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
}
/*
- * This page will go to BIO. Do we need to send this BIO off first?
+ * This folio will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
args->bio = mpage_bio_submit(args->bio);
@@ -266,7 +271,7 @@ alloc_new:
if (args->bio == NULL) {
if (first_hole == blocks_per_page) {
if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
- page))
+ &folio->page))
goto out;
}
args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op,
@@ -277,7 +282,7 @@ alloc_new:
}
length = first_hole << blkbits;
- if (bio_add_page(args->bio, page, length, 0) < length) {
+ if (!bio_add_folio(args->bio, folio, length, 0)) {
args->bio = mpage_bio_submit(args->bio);
goto alloc_new;
}
@@ -295,10 +300,10 @@ out:
confused:
if (args->bio)
args->bio = mpage_bio_submit(args->bio);
- if (!PageUptodate(page))
- block_read_full_folio(page_folio(page), args->get_block);
+ if (!folio_test_uptodate(folio))
+ block_read_full_folio(folio, args->get_block);
else
- unlock_page(page);
+ folio_unlock(folio);
goto out;
}
@@ -343,18 +348,17 @@ confused:
*/
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
- struct page *page;
+ struct folio *folio;
struct mpage_readpage_args args = {
.get_block = get_block,
.is_readahead = true,
};
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
- args.page = page;
+ while ((folio = readahead_folio(rac))) {
+ prefetchw(&folio->flags);
+ args.folio = folio;
args.nr_pages = readahead_count(rac);
args.bio = do_mpage_readpage(&args);
- put_page(page);
}
if (args.bio)
mpage_bio_submit(args.bio);
@@ -367,13 +371,11 @@ EXPORT_SYMBOL(mpage_readahead);
int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
struct mpage_readpage_args args = {
- .page = &folio->page,
+ .folio = folio,
.nr_pages = 1,
.get_block = get_block,
};
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
args.bio = do_mpage_readpage(&args);
if (args.bio)
mpage_bio_submit(args.bio);
@@ -402,7 +404,6 @@ struct mpage_data {
struct bio *bio;
sector_t last_block_in_bio;
get_block_t *get_block;
- unsigned use_writepage;
};
/*
@@ -622,15 +623,10 @@ confused:
if (bio)
bio = mpage_bio_submit(bio);
- if (mpd->use_writepage) {
- ret = mapping->a_ops->writepage(page, wbc);
- } else {
- ret = -EAGAIN;
- goto out;
- }
/*
* The caller has a ref on the inode, so *mapping is stable
*/
+ ret = block_write_full_page(page, mpd->get_block, wbc);
mapping_set_error(mapping, ret);
out:
mpd->bio = bio;
@@ -642,8 +638,6 @@ out:
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @get_block: the filesystem's block mapper function.
- * If this is NULL then use a_ops->writepage. Otherwise, go
- * direct-to-BIO.
*
* This is a library function, which implements the writepages()
* address_space_operation.
@@ -660,42 +654,17 @@ int
mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block)
{
+ struct mpage_data mpd = {
+ .get_block = get_block,
+ };
struct blk_plug plug;
int ret;
blk_start_plug(&plug);
-
- if (!get_block)
- ret = generic_writepages(mapping, wbc);
- else {
- struct mpage_data mpd = {
- .bio = NULL,
- .last_block_in_bio = 0,
- .get_block = get_block,
- .use_writepage = 1,
- };
-
- ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(mpd.bio);
- }
- blk_finish_plug(&plug);
- return ret;
-}
-EXPORT_SYMBOL(mpage_writepages);
-
-int mpage_writepage(struct page *page, get_block_t get_block,
- struct writeback_control *wbc)
-{
- struct mpage_data mpd = {
- .bio = NULL,
- .last_block_in_bio = 0,
- .get_block = get_block,
- .use_writepage = 0,
- };
- int ret = __mpage_writepage(page, wbc, &mpd);
+ ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio)
mpage_bio_submit(mpd.bio);
+ blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL(mpage_writepage);
+EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2d72b1b7ed74..549baed76351 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -533,9 +533,7 @@ const struct address_space_operations nfs_file_aops = {
.write_end = nfs_write_end,
.invalidate_folio = nfs_invalidate_folio,
.release_folio = nfs_release_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = nfs_migrate_page,
-#endif
+ .migrate_folio = nfs_migrate_folio,
.launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8f8cd6e2d4db..437ebe544aaf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -578,8 +578,10 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
#endif
#ifdef CONFIG_MIGRATION
-extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *, enum migrate_mode);
+int nfs_migrate_folio(struct address_space *, struct folio *dst,
+ struct folio *src, enum migrate_mode);
+#else
+#define nfs_migrate_folio NULL
#endif
static inline int
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5a9b043662e9..8ae2c8d1219d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -120,12 +120,8 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
SetPageError(page);
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- struct address_space *mapping = page_file_mapping(page);
-
if (PageUptodate(page))
nfs_fscache_write_page(inode, page);
- else if (!PageError(page) && !PagePrivate(page))
- generic_error_remove_page(mapping, page);
unlock_page(page);
}
nfs_release_request(req);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1c706465d090..69569696dde0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2119,27 +2119,27 @@ out_error:
}
#ifdef CONFIG_MIGRATION
-int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
+int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
/*
- * If PagePrivate is set, then the page is currently associated with
+ * If the private flag is set, the folio is currently associated with
* an in-progress read or write request. Don't try to migrate it.
*
* FIXME: we could do this in principle, but we'll need a way to ensure
* that we can safely release the inode reference while holding
- * the page lock.
+ * the folio lock.
*/
- if (PagePrivate(page))
+ if (folio_test_private(src))
return -EBUSY;
- if (PageFsCache(page)) {
+ if (folio_test_fscache(src)) {
if (mode == MIGRATE_ASYNC)
return -EBUSY;
- wait_on_page_fscache(page);
+ folio_wait_fscache(src);
}
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
#endif
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 840e3af63a6f..1b09d7293bc5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1173,6 +1173,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
nfsd_copy_write_verifier(verf, nn);
err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
since);
+ err = nfserrno(err2);
break;
case -EINVAL:
err = nfserr_notsupp;
@@ -1180,8 +1181,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
default:
nfsd_reset_write_verifier(nn);
trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ err = nfserrno(err2);
}
- err = nfserrno(err2);
} else
nfsd_copy_write_verifier(verf, nn);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f8f4c2ff52f4..decd6471300b 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -194,7 +194,7 @@ static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
if (!IS_ERR(page)) {
kmap(page);
if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !nilfs_check_page(page))
+ if (!nilfs_check_page(page))
goto fail;
}
}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 1344f7d475d3..aecda4fc95f5 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -198,6 +198,9 @@ static inline int nilfs_acl_chmod(struct inode *inode)
static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
{
+ if (S_ISLNK(inode->i_mode))
+ return 0;
+
inode->i_mode &= ~current_umask();
return 0;
}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a8e88cc38e16..3267e96c256c 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -294,57 +294,57 @@ repeat:
void nilfs_copy_back_pages(struct address_space *dmap,
struct address_space *smap)
{
- struct pagevec pvec;
+ struct folio_batch fbatch;
unsigned int i, n;
- pgoff_t index = 0;
+ pgoff_t start = 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
repeat:
- n = pagevec_lookup(&pvec, smap, &index);
+ n = filemap_get_folios(smap, &start, ~0UL, &fbatch);
if (!n)
return;
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i], *dpage;
- pgoff_t offset = page->index;
-
- lock_page(page);
- dpage = find_lock_page(dmap, offset);
- if (dpage) {
- /* overwrite existing page in the destination cache */
- WARN_ON(PageDirty(dpage));
- nilfs_copy_page(dpage, page, 0);
- unlock_page(dpage);
- put_page(dpage);
- /* Do we not need to remove page from smap here? */
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i], *dfolio;
+ pgoff_t index = folio->index;
+
+ folio_lock(folio);
+ dfolio = filemap_lock_folio(dmap, index);
+ if (dfolio) {
+ /* overwrite existing folio in the destination cache */
+ WARN_ON(folio_test_dirty(dfolio));
+ nilfs_copy_page(&dfolio->page, &folio->page, 0);
+ folio_unlock(dfolio);
+ folio_put(dfolio);
+ /* Do we not need to remove folio from smap here? */
} else {
- struct page *p;
+ struct folio *f;
- /* move the page to the destination cache */
+ /* move the folio to the destination cache */
xa_lock_irq(&smap->i_pages);
- p = __xa_erase(&smap->i_pages, offset);
- WARN_ON(page != p);
+ f = __xa_erase(&smap->i_pages, index);
+ WARN_ON(folio != f);
smap->nrpages--;
xa_unlock_irq(&smap->i_pages);
xa_lock_irq(&dmap->i_pages);
- p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
- if (unlikely(p)) {
+ f = __xa_store(&dmap->i_pages, index, folio, GFP_NOFS);
+ if (unlikely(f)) {
/* Probably -ENOMEM */
- page->mapping = NULL;
- put_page(page);
+ folio->mapping = NULL;
+ folio_put(folio);
} else {
- page->mapping = dmap;
+ folio->mapping = dmap;
dmap->nrpages++;
- if (PageDirty(page))
- __xa_set_mark(&dmap->i_pages, offset,
+ if (folio_test_dirty(folio))
+ __xa_set_mark(&dmap->i_pages, index,
PAGECACHE_TAG_DIRTY);
}
xa_unlock_irq(&dmap->i_pages);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
goto repeat;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 9e3964ea2ea0..5f4fb6ca6f2e 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1659,7 +1659,7 @@ const struct address_space_operations ntfs_normal_aops = {
.dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.bmap = ntfs_bmap,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_compressed_aops = {
.writepage = ntfs_writepage,
.dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -1688,7 +1688,7 @@ const struct address_space_operations ntfs_mst_aops = {
.writepage = ntfs_writepage, /* Write dirty page to disk. */
.dirty_folio = filemap_dirty_folio,
#endif /* NTFS_RW */
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 934d5f79b9e7..0cac5458c023 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -74,13 +74,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
{
struct page *page = read_mapping_page(mapping, index, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (!PageError(page))
- return page;
- ntfs_unmap_page(page);
- return ERR_PTR(-EIO);
- }
return page;
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index a8abe2296514..3436d58fbb74 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -219,11 +219,6 @@ do_non_resident_extend:
err = PTR_ERR(page);
goto init_err_out;
}
- if (unlikely(PageError(page))) {
- put_page(page);
- err = -EIO;
- goto init_err_out;
- }
/*
* Update the initialized size in the ntfs inode. This is
* enough to make ntfs_writepage() work.
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index e8c00dda42ad..fc0623b029e6 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -1173,7 +1173,7 @@ int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type,
{
struct ntfs_sb_info *sbi = ni->mi.sbi;
u8 cluster_bits = sbi->cluster_bits;
- CLST vcn = from >> cluster_bits;
+ CLST vcn;
CLST vcn_last = (to - 1) >> cluster_bits;
CLST lcn, clen;
int err;
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index aa184407520f..e3b5680fd516 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -1333,9 +1333,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
if (!new_free)
return -ENOMEM;
- if (new_free != wnd->free_bits)
- memcpy(new_free, wnd->free_bits,
- wnd->nwnd * sizeof(short));
+ memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
memset(new_free + wnd->nwnd, 0,
(new_wnd - wnd->nwnd) * sizeof(short));
kfree(wnd->free_bits);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 8e9d2b35175f..e579a9a878fb 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -989,7 +989,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
if (bytes > count)
bytes = count;
- frame = pos >> frame_bits;
frame_vbo = pos & ~(frame_size - 1);
index = frame_vbo >> PAGE_SHIFT;
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 18842998c8fa..3576268ee0a1 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -7,6 +7,7 @@
#include <linux/fiemap.h>
#include <linux/fs.h>
+#include <linux/minmax.h>
#include <linux/vmalloc.h>
#include "debug.h"
@@ -649,6 +650,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
struct mft_inode *mi;
u32 asize, free;
struct MFT_REF ref;
+ struct MFT_REC *mrec;
__le16 id;
if (!ni->attr_list.dirty)
@@ -692,11 +694,17 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
free -= asize;
}
+ /* Make a copy of primary record to restore if error. */
+ mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS);
+ if (!mrec)
+ return 0; /* Not critical. */
+
/* It seems that attribute list can be removed from primary record. */
mi_remove_attr(NULL, &ni->mi, attr_list);
/*
- * Repeat the cycle above and move all attributes to primary record.
+ * Repeat the cycle above and copy all attributes to primary record.
+ * Do not remove original attributes from subrecords!
* It should be success!
*/
le = NULL;
@@ -707,14 +715,14 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
mi = ni_find_mi(ni, ino_get(&le->ref));
if (!mi) {
/* Should never happened, 'cause already checked. */
- goto bad;
+ goto out;
}
attr = mi_find_attr(mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
if (!attr) {
/* Should never happened, 'cause already checked. */
- goto bad;
+ goto out;
}
asize = le32_to_cpu(attr->size);
@@ -724,18 +732,33 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
le16_to_cpu(attr->name_off));
if (!attr_ins) {
/*
- * Internal error.
- * Either no space in primary record (already checked).
- * Either tried to insert another
- * non indexed attribute (logic error).
+ * No space in primary record (already checked).
*/
- goto bad;
+ goto out;
}
/* Copy all except id. */
id = attr_ins->id;
memcpy(attr_ins, attr, asize);
attr_ins->id = id;
+ }
+
+ /*
+ * Repeat the cycle above and remove all attributes from subrecords.
+ */
+ le = NULL;
+ while ((le = al_enumerate(ni, le))) {
+ if (!memcmp(&le->ref, &ref, sizeof(ref)))
+ continue;
+
+ mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi)
+ continue;
+
+ attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ le->name_len, &le->id);
+ if (!attr)
+ continue;
/* Remove from original record. */
mi_remove_attr(NULL, mi, attr);
@@ -748,11 +771,13 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
ni->attr_list.le = NULL;
ni->attr_list.dirty = false;
+ kfree(mrec);
+ return 0;
+out:
+ /* Restore primary record. */
+ swap(mrec, ni->mi.mrec);
+ kfree(mrec);
return 0;
-bad:
- ntfs_inode_err(&ni->vfs_inode, "Internal error");
- make_bad_inode(&ni->vfs_inode);
- return -EINVAL;
}
/*
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 49b7df616778..e7c494005122 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -3843,6 +3843,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
memset(&rst_info2, 0, sizeof(struct restart_info));
err = log_read_rst(log, l_size, false, &rst_info2);
+ if (err)
+ goto out;
/* Determine which restart area to use. */
if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
@@ -5057,7 +5059,7 @@ undo_action_next:
goto add_allocated_vcns;
vcn = le64_to_cpu(lrh->target_vcn);
- vcn &= ~(log->clst_per_page - 1);
+ vcn &= ~(u64)(log->clst_per_page - 1);
add_allocated_vcns:
for (i = 0, vcn = le64_to_cpu(lrh->target_vcn),
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 6f81e3a49abf..84ccc1409874 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1042,19 +1042,16 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
{
int err;
struct NTFS_DE *e;
- const struct INDEX_HDR *hdr;
struct indx_node *node;
if (!root)
root = indx_get_root(&ni->dir, ni, NULL, NULL);
if (!root) {
- err = -EINVAL;
- goto out;
+ /* Should not happen. */
+ return -EINVAL;
}
- hdr = &root->ihdr;
-
/* Check cache. */
e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de;
if (e && !de_is_last(e) &&
@@ -1068,39 +1065,35 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
fnd_clear(fnd);
/* Lookup entry that is <= to the search value. */
- e = hdr_find_e(indx, hdr, key, key_len, ctx, diff);
+ e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff);
if (!e)
return -EINVAL;
fnd->root_de = e;
- err = 0;
for (;;) {
node = NULL;
- if (*diff >= 0 || !de_has_vcn_ex(e)) {
- *entry = e;
- goto out;
- }
+ if (*diff >= 0 || !de_has_vcn_ex(e))
+ break;
/* Read next level. */
err = indx_read(indx, ni, de_get_vbn(e), &node);
if (err)
- goto out;
+ return err;
/* Lookup entry that is <= to the search value. */
e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
diff);
if (!e) {
- err = -EINVAL;
put_indx_node(node);
- goto out;
+ return -EINVAL;
}
fnd_push(fnd, node, e);
}
-out:
- return err;
+ *entry = e;
+ return 0;
}
int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
@@ -1994,7 +1987,7 @@ static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni,
const struct NTFS_DE *e, bool trim)
{
int err;
- struct indx_node *n;
+ struct indx_node *n = NULL;
struct INDEX_HDR *hdr;
CLST vbn = de_get_vbn(e);
size_t i;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index be4ebdd8048b..b1ac6000c079 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -430,6 +430,7 @@ end_enum:
} else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) &&
fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
/* Records in $Extend are not a files or general directories. */
+ inode->i_op = &ntfs_file_inode_operations;
} else {
err = -EINVAL;
goto out;
@@ -851,12 +852,10 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
static int ntfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct inode *inode = mapping->host;
- struct ntfs_inode *ni = ntfs_i(inode);
/* Redirect call to 'ntfs_writepage' for resident files. */
- get_block_t *get_block = is_resident(ni) ? NULL : &ntfs_get_block;
-
- return mpage_writepages(mapping, wbc, get_block);
+ if (is_resident(ntfs_i(mapping->host)))
+ return generic_writepages(mapping, wbc);
+ return mpage_writepages(mapping, wbc, ntfs_get_block);
}
static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 8de129a6419b..f28726c4e845 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -896,13 +896,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
{
struct page *page = read_mapping_page(mapping, index, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (!PageError(page))
- return page;
- ntfs_unmap_page(page);
- return ERR_PTR(-EIO);
- }
return page;
}
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 861e35791506..8fe0a876400a 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -445,12 +445,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
attr = NULL;
while ((attr = mi_enum_attr(mi, attr))) {
diff = compare_attr(attr, type, name, name_len, upcase);
- if (diff > 0)
- break;
+
if (diff < 0)
continue;
- if (!is_attr_indexed(attr))
+ if (!diff && !is_attr_indexed(attr))
return NULL;
break;
}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 0c6de6287737..b41d7c824a50 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -30,6 +30,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/log2.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/nls.h>
#include <linux/seq_file.h>
@@ -390,7 +391,7 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
return -EINVAL;
}
- memcpy(sbi->options, new_opts, sizeof(*new_opts));
+ swap(sbi->options, fc->fs_private);
return 0;
}
@@ -900,6 +901,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.high = 0;
sbi->sb = sb;
+ sbi->options = fc->fs_private;
+ fc->fs_private = NULL;
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = 0x7366746e; // "ntfs"
sb->s_op = &ntfs_sops;
@@ -1262,8 +1265,6 @@ load_root:
goto put_inode_out;
}
- fc->fs_private = NULL;
-
return 0;
put_inode_out:
@@ -1416,7 +1417,6 @@ static int ntfs_init_fs_context(struct fs_context *fc)
mutex_init(&sbi->compress.mtx_lzx);
#endif
- sbi->options = opts;
fc->s_fs_info = sbi;
ok:
fc->fs_private = opts;
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 5e0e0280e70d..2d29afe4b40b 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -706,13 +706,13 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_default_acl = NULL;
}
- if (!acl)
- inode->i_acl = NULL;
- else {
+ if (acl) {
if (!err)
err = ntfs_set_acl_ex(mnt_userns, inode, acl,
ACL_TYPE_ACCESS, true);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return err;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 35d40a67204c..1d489003f99d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -277,16 +277,14 @@ out:
static int ocfs2_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start = (loff_t)page->index << PAGE_SHIFT;
+ loff_t start = folio_pos(folio);
int ret, unlock = 1;
- trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
- (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index);
- ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
+ ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
@@ -296,11 +294,11 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio)
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
/*
- * Unlock the page and cycle ip_alloc_sem so that we don't
+ * Unlock the folio and cycle ip_alloc_sem so that we don't
* busyloop waiting for ip_alloc_sem to unlock
*/
ret = AOP_TRUNCATED_PAGE;
- unlock_page(page);
+ folio_unlock(folio);
unlock = 0;
down_read(&oi->ip_alloc_sem);
up_read(&oi->ip_alloc_sem);
@@ -313,21 +311,21 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio)
* block_read_full_folio->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
* (generic_file_read, vm_ops->fault) are clever enough to check i_size
- * and notice that the page they just read isn't needed.
+ * and notice that the folio they just read isn't needed.
*
* XXX sys_readahead() seems to get that wrong?
*/
if (start >= i_size_read(inode)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
ret = 0;
goto out_alloc;
}
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- ret = ocfs2_readpage_inline(inode, page);
+ ret = ocfs2_readpage_inline(inode, &folio->page);
else
- ret = block_read_full_folio(page_folio(page), ocfs2_get_block);
+ ret = block_read_full_folio(folio, ocfs2_get_block);
unlock = 0;
out_alloc:
@@ -336,7 +334,7 @@ out_inode_unlock:
ocfs2_inode_unlock(inode, 0);
out:
if (unlock)
- unlock_page(page);
+ folio_unlock(folio);
return ret;
}
@@ -2464,7 +2462,7 @@ const struct address_space_operations ocfs2_aops = {
.direct_IO = ocfs2_direct_IO,
.invalidate_folio = block_invalidate_folio,
.release_folio = ocfs2_release_folio,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7497cd592258..9c67edd215d5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (status)
return status;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
status = dquot_initialize(inode);
if (status)
return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 337527571461..740b64238312 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -277,7 +277,6 @@ enum ocfs2_mount_options
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
- OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
- return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
- || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER));
+ return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e04358a46b68..1358981e80a3 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3146,48 +3146,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters)
{
- int ret = 0;
- loff_t offset, end, map_end;
- pgoff_t page_index;
- struct page *page;
+ int ret;
+ loff_t start, end;
if (ocfs2_should_order_data(inode))
return 0;
- offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
- end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+ start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1;
- ret = filemap_fdatawrite_range(inode->i_mapping,
- offset, end - 1);
- if (ret < 0) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret < 0)
mlog_errno(ret);
- return ret;
- }
-
- while (offset < end) {
- page_index = offset >> PAGE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
- if (map_end > end)
- map_end = end;
-
- page = find_or_create_page(inode->i_mapping,
- page_index, GFP_NOFS);
- BUG_ON(!page);
-
- wait_on_page_writeback(page);
- if (PageError(page)) {
- ret = -EIO;
- mlog_errno(ret);
- } else
- mark_page_accessed(page);
-
- unlock_page(page);
- put_page(page);
- page = NULL;
- offset = map_end;
- if (ret)
- break;
- }
return ret;
}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 0b0ae3ebb0cf..da7718cef735 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
- if (!si->si_slots[preferred].sl_valid ||
- !si->si_slots[preferred].sl_node_num) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (!si->si_slots[i].sl_valid ||
- !si->si_slots[i].sl_node_num) {
+ if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
@@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- if (ocfs2_mount_local(osb))
- /* use slot 0 directly in local mode */
- slot = 0;
- else {
- /* search for ourselves first and take the slot if it already
- * exists. Perhaps we need to mark this in a variable for our
- * own journal recovery? Possibly not, though we certainly
- * need to warn to the user */
- slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ /* search for ourselves first and take the slot if it already
+ * exists. Perhaps we need to mark this in a variable for our
+ * own journal recovery? Possibly not, though we certainly
+ * need to warn to the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ if (slot < 0) {
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
- /* if no slot yet, then just take 1st available
- * one. */
- slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot < 0) {
- spin_unlock(&osb->osb_lock);
- mlog(ML_ERROR, "no free slots available!\n");
- status = -EINVAL;
- goto bail;
- }
- } else
- printk(KERN_INFO "ocfs2: Slot %d on device (%s) was "
- "already allocated to this node!\n",
- slot, osb->dev_str);
- }
+ spin_unlock(&osb->osb_lock);
+ mlog(ML_ERROR, "no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f7298816d8d9..438be028935d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -172,7 +172,6 @@ enum {
Opt_dir_resv_level,
Opt_journal_async_commit,
Opt_err_cont,
- Opt_nocluster,
Opt_err,
};
@@ -206,7 +205,6 @@ static const match_table_t tokens = {
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_err_cont, "errors=continue"},
- {Opt_nocluster, "nocluster"},
{Opt_err, NULL}
};
@@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- tmp = OCFS2_MOUNT_NOCLUSTER;
- if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
- ret = -EINVAL;
- mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
- goto out;
- }
-
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
@@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
}
if (ocfs2_userspace_stack(osb) &&
- !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
OCFS2_STACK_LABEL_LEN)) {
mlog(ML_ERROR,
@@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
- if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
- !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT))
- printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted "
- "without cluster aware mode.\n", osb->dev_str);
-
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
@@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_journal_async_commit:
mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
break;
- case Opt_nocluster:
- mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER;
- break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
seq_printf(s, ",journal_async_commit");
- if (opts & OCFS2_MOUNT_NOCLUSTER)
- seq_printf(s, ",nocluster");
-
return 0;
}
diff --git a/fs/open.c b/fs/open.c
index 1d57fbde2feb..2790aac66e58 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
return do_fchmodat(AT_FDCWD, filename, mode);
}
+/**
+ * setattr_vfsuid - check and set ia_fsuid attribute
+ * @kuid: new inode owner
+ *
+ * Check whether @kuid is valid and if so generate and set vfsuid_t in
+ * ia_vfsuid.
+ *
+ * Return: true if @kuid is valid, false if not.
+ */
+static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
+{
+ if (!uid_valid(kuid))
+ return false;
+ attr->ia_valid |= ATTR_UID;
+ attr->ia_vfsuid = VFSUIDT_INIT(kuid);
+ return true;
+}
+
+/**
+ * setattr_vfsgid - check and set ia_fsgid attribute
+ * @kgid: new inode owner
+ *
+ * Check whether @kgid is valid and if so generate and set vfsgid_t in
+ * ia_vfsgid.
+ *
+ * Return: true if @kgid is valid, false if not.
+ */
+static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
+{
+ if (!gid_valid(kgid))
+ return false;
+ attr->ia_valid |= ATTR_GID;
+ attr->ia_vfsgid = VFSGIDT_INIT(kgid);
+ return true;
+}
+
int chown_common(const struct path *path, uid_t user, gid_t group)
{
struct user_namespace *mnt_userns, *fs_userns;
@@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
mnt_userns = mnt_user_ns(path->mnt);
fs_userns = i_user_ns(inode);
- uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
- gid = mapped_kgid_user(mnt_userns, fs_userns, gid);
retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
- if (user != (uid_t) -1) {
- if (!uid_valid(uid))
- return -EINVAL;
- newattrs.ia_valid |= ATTR_UID;
- newattrs.ia_uid = uid;
- }
- if (group != (gid_t) -1) {
- if (!gid_valid(gid))
- return -EINVAL;
- newattrs.ia_valid |= ATTR_GID;
- newattrs.ia_gid = gid;
- }
+ if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
+ return -EINVAL;
+ if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
+ return -EINVAL;
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
- error = security_path_chown(path, uid, gid);
+ /* Continue to send actual fs values, not the mount values. */
+ error = security_path_chown(
+ path,
+ from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid),
+ from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid));
if (!error)
error = notify_change(mnt_userns, path->dentry, &newattrs,
&delegated_inode);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 5ce27dde3c79..7a8c0c6e698d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -307,7 +307,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
folio_size(folio), inode->i_size, NULL, NULL, file);
- /* this will only zero remaining unread portions of the page data */
+ /* this will only zero remaining unread portions of the folio data */
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
flush_dcache_folio(folio);
@@ -315,8 +315,6 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
folio_set_error(folio);
} else {
folio_mark_uptodate(folio);
- if (folio_test_error(folio))
- folio_clear_error(folio);
ret = 0;
}
/* unlock the folio after the ->read_folio() routine completes */
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 714ec569d25b..245e2cb62708 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -331,8 +331,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = stat->uid,
- .ia_gid = stat->gid,
+ .ia_vfsuid = VFSUIDT_INIT(stat->uid),
+ .ia_vfsgid = VFSGIDT_INIT(stat->gid),
};
err = ovl_do_notify_change(ofs, upperdentry, &attr);
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4f34b7e02eee..e22e20f4811a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs,
struct dentry *upperdentry,
struct iattr *attr)
{
- struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs);
- struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry));
-
- if (attr->ia_valid & ATTR_UID)
- attr->ia_uid = mapped_kuid_user(upper_mnt_userns,
- fs_userns, attr->ia_uid);
- if (attr->ia_valid & ATTR_GID)
- attr->ia_gid = mapped_kgid_user(upper_mnt_userns,
- fs_userns, attr->ia_gid);
-
- return notify_change(upper_mnt_userns, upperdentry, attr, NULL);
+ return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL);
}
static inline int ovl_do_rmdir(struct ovl_fs *ofs,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index eb815759842c..7d1c3114d496 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -276,7 +276,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 14658b009f1b..ffbadb8b3032 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -55,6 +55,7 @@ static void free_pstore_private(struct pstore_private *private)
return;
if (private->record) {
kfree(private->record->buf);
+ kfree(private->record->priv);
kfree(private->record);
}
kfree(private);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e26162f102ff..b2fd3c20e7c2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -28,11 +28,14 @@
#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/timer.h>
+#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>
+#include <crypto/acompress.h>
+
#include "internal.h"
/*
@@ -90,7 +93,8 @@ module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
/* Compression parameters */
-static struct crypto_comp *tfm;
+static struct crypto_acomp *tfm;
+static struct acomp_req *creq;
struct pstore_zbackend {
int (*zbufsize)(size_t size);
@@ -268,12 +272,21 @@ static const struct pstore_zbackend zbackends[] = {
static int pstore_compress(const void *in, void *out,
unsigned int inlen, unsigned int outlen)
{
+ struct scatterlist src, dst;
int ret;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
- ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
+ sg_init_table(&src, 1);
+ sg_set_buf(&src, in, inlen);
+
+ sg_init_table(&dst, 1);
+ sg_set_buf(&dst, out, outlen);
+
+ acomp_request_set_params(creq, &src, &dst, inlen, outlen);
+
+ ret = crypto_acomp_compress(creq);
if (ret) {
pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
return ret;
@@ -284,7 +297,7 @@ static int pstore_compress(const void *in, void *out,
static void allocate_buf_for_compression(void)
{
- struct crypto_comp *ctx;
+ struct crypto_acomp *acomp;
int size;
char *buf;
@@ -296,7 +309,7 @@ static void allocate_buf_for_compression(void)
if (!psinfo || tfm)
return;
- if (!crypto_has_comp(zbackend->name, 0, 0)) {
+ if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) {
pr_err("Unknown compression: %s\n", zbackend->name);
return;
}
@@ -315,16 +328,24 @@ static void allocate_buf_for_compression(void)
return;
}
- ctx = crypto_alloc_comp(zbackend->name, 0, 0);
- if (IS_ERR_OR_NULL(ctx)) {
+ acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(acomp)) {
kfree(buf);
pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
- PTR_ERR(ctx));
+ PTR_ERR(acomp));
+ return;
+ }
+
+ creq = acomp_request_alloc(acomp);
+ if (!creq) {
+ crypto_free_acomp(acomp);
+ kfree(buf);
+ pr_err("acomp_request_alloc('%s') failed\n", zbackend->name);
return;
}
/* A non-NULL big_oops_buf indicates compression is available. */
- tfm = ctx;
+ tfm = acomp;
big_oops_buf_sz = size;
big_oops_buf = buf;
@@ -334,7 +355,8 @@ static void allocate_buf_for_compression(void)
static void free_buf_for_compression(void)
{
if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
- crypto_free_comp(tfm);
+ acomp_request_free(creq);
+ crypto_free_acomp(tfm);
tfm = NULL;
}
kfree(big_oops_buf);
@@ -671,6 +693,8 @@ static void decompress_record(struct pstore_record *record)
int ret;
int unzipped_len;
char *unzipped, *workspace;
+ struct acomp_req *dreq;
+ struct scatterlist src, dst;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
return;
@@ -694,16 +718,30 @@ static void decompress_record(struct pstore_record *record)
if (!workspace)
return;
+ dreq = acomp_request_alloc(tfm);
+ if (!dreq) {
+ kfree(workspace);
+ return;
+ }
+
+ sg_init_table(&src, 1);
+ sg_set_buf(&src, record->buf, record->size);
+
+ sg_init_table(&dst, 1);
+ sg_set_buf(&dst, workspace, unzipped_len);
+
+ acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len);
+
/* After decompression "unzipped_len" is almost certainly smaller. */
- ret = crypto_comp_decompress(tfm, record->buf, record->size,
- workspace, &unzipped_len);
+ ret = crypto_acomp_decompress(dreq);
if (ret) {
- pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
+ pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret);
kfree(workspace);
return;
}
/* Append ECC notice to decompressed buffer. */
+ unzipped_len = dreq->dlen;
memcpy(workspace + unzipped_len, record->buf + record->size,
record->ecc_notice_size);
@@ -711,6 +749,7 @@ static void decompress_record(struct pstore_record *record)
unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
GFP_KERNEL);
kfree(workspace);
+ acomp_request_free(dreq);
if (!unzipped)
return;
@@ -769,6 +808,7 @@ void pstore_get_backend_records(struct pstore_info *psi,
if (rc) {
/* pstore_mkfile() did not take record, so free it. */
kfree(record->buf);
+ kfree(record->priv);
kfree(record);
if (rc != -EEXIST || !quiet)
failed++;
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 7c8f8feac6c3..017d0d4ad329 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -363,7 +363,7 @@ static int psz_kmsg_recover_data(struct psz_context *cxt)
rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf),
zone->off);
if (rcnt != zone->buffer_size + sizeof(*buf))
- return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ return rcnt < 0 ? rcnt : -EIO;
}
return 0;
}
@@ -372,7 +372,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
{
struct pstore_zone_info *info = cxt->pstore_zone_info;
struct pstore_zone *zone;
- size_t rcnt, len;
+ ssize_t rcnt, len;
struct psz_buffer *buf;
struct psz_kmsg_header *hdr;
struct timespec64 time = { };
@@ -400,7 +400,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
continue;
} else if (rcnt != len) {
pr_err("read %s with id %lu failed\n", zone->name, i);
- return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ return rcnt < 0 ? rcnt : -EIO;
}
if (buf->sig != zone->buffer->sig) {
@@ -502,7 +502,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
rcnt = info->read((char *)&tmpbuf, len, zone->off);
if (rcnt != len) {
pr_debug("read zone %s failed\n", zone->name);
- return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ return rcnt < 0 ? rcnt : -EIO;
}
if (tmpbuf.sig != zone->buffer->sig) {
@@ -544,7 +544,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
rcnt = info->read(buf, len - start, off + start);
if (rcnt != len - start) {
pr_err("read zone %s failed\n", zone->name);
- ret = (int)rcnt < 0 ? (int)rcnt : -EIO;
+ ret = rcnt < 0 ? rcnt : -EIO;
goto free_oldbuf;
}
@@ -552,7 +552,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
rcnt = info->read(buf + len - start, start, off);
if (rcnt != start) {
pr_err("read zone %s failed\n", zone->name);
- ret = (int)rcnt < 0 ? (int)rcnt : -EIO;
+ ret = rcnt < 0 ? rcnt : -EIO;
goto free_oldbuf;
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 09d1307959d0..28966da7834e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer);
/* Wrapper for transferring ownership of an inode for uid/gid only
* Called from FSXXX_setattr()
*/
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
+ struct iattr *iattr)
{
struct dquot *transfer_to[MAXQUOTAS] = {};
struct dquot *dquot;
@@ -2095,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (!dquot_active(inode))
return 0;
- if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
- dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
+ if (i_uid_needs_update(mnt_userns, iattr, inode)) {
+ kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
+
+ dquot = dqget(sb, make_kqid_uid(kuid));
if (IS_ERR(dquot)) {
if (PTR_ERR(dquot) != -ESRCH) {
ret = PTR_ERR(dquot);
@@ -2106,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
}
transfer_to[USRQUOTA] = dquot;
}
- if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
- dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
+ if (i_gid_needs_update(mnt_userns, iattr, inode)) {
+ kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
+
+ dquot = dqget(sb, make_kqid_gid(kgid));
if (IS_ERR(dquot)) {
if (PTR_ERR(dquot) != -ESRCH) {
ret = PTR_ERR(dquot);
diff --git a/fs/read_write.c b/fs/read_write.c
index b1b1cdfee9d3..a1f4d45c2406 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1649,7 +1649,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !((iocb->ki_flags & IOCB_DIRECT) ||
+ (file->f_mode & FMODE_BUF_WASYNC)))
return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0cffe054b78e..1141053b96ed 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3284,7 +3284,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
/* must be turned off for recursive notify_change calls */
ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
@@ -3367,7 +3367,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
reiserfs_write_lock(inode->i_sb);
if (error) {
journal_end(&th);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index bd073836e141..436641369283 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -440,16 +440,9 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
*/
mapping_set_gfp_mask(mapping, GFP_NOFS);
page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (PageError(page))
- goto fail;
- }
return page;
-
-fail:
- reiserfs_put_page(page);
- return ERR_PTR(-EIO);
}
static inline __u32 xattr_hash(const char *msg, int len)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e112b5424cdb..f1a3795812ce 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -148,16 +148,7 @@ static int generic_remap_check_len(struct inode *inode_in,
/* Read a page's worth of file data into the page cache. */
static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
{
- struct folio *folio;
-
- folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
- if (IS_ERR(folio))
- return folio;
- if (!folio_test_uptodate(folio)) {
- folio_put(folio);
- return ERR_PTR(-EIO);
- }
- return folio;
+ return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
}
/*
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a8e495d8eb86..7f0904b20329 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -454,7 +454,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio)
int expected = index == file_end ?
(i_size_read(inode) & (msblk->block_size - 1)) :
msblk->block_size;
- int res;
+ int res = 0;
void *pageaddr;
TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
@@ -467,14 +467,15 @@ static int squashfs_read_folio(struct file *file, struct folio *folio)
if (index < file_end || squashfs_i(inode)->fragment_block ==
SQUASHFS_INVALID_BLK) {
u64 block = 0;
- int bsize = read_blocklist(inode, index, &block);
- if (bsize < 0)
+
+ res = read_blocklist(inode, index, &block);
+ if (res < 0)
goto error_out;
- if (bsize == 0)
+ if (res == 0)
res = squashfs_readpage_sparse(page, expected);
else
- res = squashfs_readpage_block(page, block, bsize, expected);
+ res = squashfs_readpage_block(page, block, res, expected);
} else
res = squashfs_readpage_fragment(page, expected);
@@ -488,11 +489,11 @@ out:
memset(pageaddr, 0, PAGE_SIZE);
kunmap_atomic(pageaddr);
flush_dcache_page(page);
- if (!PageError(page))
+ if (res == 0)
SetPageUptodate(page);
unlock_page(page);
- return 0;
+ return res;
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 04ced154960f..f2353dd676ef 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1461,29 +1461,6 @@ static bool ubifs_dirty_folio(struct address_space *mapping,
return ret;
}
-#ifdef CONFIG_MIGRATION
-static int ubifs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
-{
- int rc;
-
- rc = migrate_page_move_mapping(mapping, newpage, page, 0);
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- if (PagePrivate(page)) {
- detach_page_private(page);
- attach_page_private(newpage, (void *)1);
- }
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
-
static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
{
struct inode *inode = folio->mapping->host;
@@ -1649,10 +1626,8 @@ const struct address_space_operations ubifs_file_address_operations = {
.write_end = ubifs_write_end,
.invalidate_folio = ubifs_invalidate_folio,
.dirty_folio = ubifs_dirty_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = ubifs_migrate_page,
-#endif
- .release_folio = ubifs_release_folio,
+ .migrate_folio = filemap_migrate_folio,
+ .release_folio = ubifs_release_folio,
};
const struct inode_operations ubifs_file_inode_operations = {
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index b721d0bda5e5..391efaf1d528 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -193,7 +193,7 @@ static struct page *ufs_get_page(struct inode *dir, unsigned long n)
if (!IS_ERR(page)) {
kmap(page);
if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !ufs_check_page(page))
+ if (!ufs_check_page(page))
goto fail;
}
}
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 4fa633f84274..08ddf41eaaad 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -264,17 +264,6 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
put_page(page);
return NULL;
}
-
- if (!PageUptodate(page) || PageError(page)) {
- unlock_page(page);
- put_page(page);
-
- printk(KERN_ERR "ufs_change_blocknr: "
- "can not read page: ino %lu, index: %lu\n",
- inode->i_ino, index);
-
- return ERR_PTR(-EIO);
- }
}
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << inode->i_blkbits, 0);
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
index bc1a7c8b5c8d..baf1d7eda0a5 100644
--- a/fs/unicode/mkutf8data.c
+++ b/fs/unicode/mkutf8data.c
@@ -486,6 +486,16 @@ static void tree_walk(struct tree *tree)
nodes, leaves, singletons);
}
+static void *xmalloc(size_t size)
+{
+ void *p = malloc(size);
+
+ if (p)
+ return p;
+ fprintf(stderr, "Out of memory.\n");
+ exit(1);
+}
+
/*
* Allocate an initialize a new internal node.
*/
@@ -494,7 +504,7 @@ static struct node *alloc_node(struct node *parent)
struct node *node;
int bitnum;
- node = malloc(sizeof(*node));
+ node = xmalloc(sizeof(*node));
node->left = node->right = NULL;
node->parent = parent;
node->leftnode = NODE;
@@ -2159,7 +2169,7 @@ static void nfdi_init(void)
}
mapping[i++] = 0;
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
@@ -2215,7 +2225,7 @@ static void nfdicf_init(void)
}
mapping[i++] = 0;
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
@@ -2256,11 +2266,11 @@ static void ignore_init(void)
line_fail(prop_name, line);
for (unichar = first; unichar <= last; unichar++) {
free(unicode_data[unichar].utf32nfdi);
- um = malloc(sizeof(unsigned int));
+ um = xmalloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
- um = malloc(sizeof(unsigned int));
+ um = xmalloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
count++;
@@ -2277,11 +2287,11 @@ static void ignore_init(void)
if (!utf32valid(unichar))
line_fail(prop_name, line);
free(unicode_data[unichar].utf32nfdi);
- um = malloc(sizeof(unsigned int));
+ um = xmalloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
- um = malloc(sizeof(unsigned int));
+ um = xmalloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
@@ -2359,7 +2369,7 @@ static void corrections_init(void)
}
mapping[i++] = 0;
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
corrections[count].utf32nfdi = um;
@@ -2459,12 +2469,12 @@ static void hangul_decompose(void)
mapping[i++] = 0;
assert(!unicode_data[unichar].utf32nfdi);
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
assert(!unicode_data[unichar].utf32nfdicf);
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
@@ -2473,7 +2483,7 @@ static void hangul_decompose(void)
* decompositions must not be stored in the generated
* trie.
*/
- unicode_data[unichar].utf8nfdi = malloc(2);
+ unicode_data[unichar].utf8nfdi = xmalloc(2);
unicode_data[unichar].utf8nfdi[0] = HANGUL;
unicode_data[unichar].utf8nfdi[1] = '\0';
@@ -2523,13 +2533,13 @@ static void nfdi_decompose(void)
if (ret)
break;
free(unicode_data[unichar].utf32nfdi);
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
}
/* Add this decomposition to nfdicf if there is no entry. */
if (!unicode_data[unichar].utf32nfdicf) {
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
}
@@ -2577,7 +2587,7 @@ static void nfdicf_decompose(void)
if (ret)
break;
free(unicode_data[unichar].utf32nfdicf);
- um = malloc(i * sizeof(unsigned int));
+ um = xmalloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8ec38b25187b..5d1a995b15f8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -570,7 +570,7 @@ const struct address_space_operations xfs_address_space_operations = {
.invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = xfs_iomap_swapfile_activate,
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 135d44133477..6ee905a09eb2 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -455,6 +455,8 @@ static inline void
xfs_attr_free_item(
struct xfs_attr_intent *attr)
{
+ ASSERT(attr->xattri_leaf_bp == NULL);
+
if (attr->xattri_da_state)
xfs_da_state_free(attr->xattri_da_state);
xfs_attri_log_nameval_put(attr->xattri_nameval);
@@ -509,6 +511,10 @@ xfs_attr_cancel_item(
struct xfs_attr_intent *attr;
attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ if (attr->xattri_leaf_bp) {
+ xfs_buf_relse(attr->xattri_leaf_bp);
+ attr->xattri_leaf_bp = NULL;
+ }
xfs_attr_free_item(attr);
}
@@ -576,7 +582,7 @@ xfs_attri_item_recover(
struct xfs_trans_res tres;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
- int error, ret = 0;
+ int error;
int total;
int local;
struct xfs_attrd_log_item *done_item = NULL;
@@ -655,13 +661,31 @@ xfs_attri_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- ret = xfs_xattri_finish_update(attr, done_item);
- if (ret == -EAGAIN) {
- /* There's more work to do, so add it to this transaction */
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error == -EAGAIN) {
+ /*
+ * There's more work to do, so add the intent item to this
+ * transaction so that we can continue it later.
+ */
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
- } else
- error = ret;
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto out_unlock;
+ /*
+ * The defer capture structure took its own reference to the
+ * attr leaf buffer and will give that to the continuation
+ * transaction. The attr intent struct drives the continuation
+ * work, so release our refcount on the attr leaf buffer but
+ * retain the pointer in the intent structure.
+ */
+ if (attr->xattri_leaf_bp)
+ xfs_buf_relse(attr->xattri_leaf_bp);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+ }
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
@@ -670,14 +694,15 @@ xfs_attri_item_recover(
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
out_unlock:
- if (attr->xattri_leaf_bp)
+ if (attr->xattri_leaf_bp) {
xfs_buf_relse(attr->xattri_leaf_bp);
+ attr->xattri_leaf_bp = NULL;
+ }
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
out:
- if (ret != -EAGAIN)
- xfs_attr_free_item(attr);
+ xfs_attr_free_item(attr);
return error;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5a171c0b244b..8d9b14d2b912 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -410,7 +410,7 @@ restart:
spin_unlock(&ip->i_flags_lock);
out:
- return file_modified(file);
+ return kiocb_modified(iocb);
}
static int
@@ -700,12 +700,11 @@ xfs_file_buffered_write(
bool cleared_space = false;
unsigned int iolock;
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
-
write_retry:
iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
@@ -1165,7 +1164,7 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 5269354b1b69..2609825d53ee 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -440,7 +440,7 @@ xfs_inodegc_queue_all(
for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
}
}
@@ -1841,8 +1841,8 @@ void
xfs_inodegc_worker(
struct work_struct *work)
{
- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
- work);
+ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
+ struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
@@ -1862,19 +1862,29 @@ xfs_inodegc_worker(
}
/*
- * Force all currently queued inode inactivation work to run immediately and
- * wait for the work to finish.
+ * Expedite all pending inodegc work to run immediately. This does not wait for
+ * completion of the work.
*/
void
-xfs_inodegc_flush(
+xfs_inodegc_push(
struct xfs_mount *mp)
{
if (!xfs_is_inodegc_enabled(mp))
return;
+ trace_xfs_inodegc_push(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+/*
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
+void
+xfs_inodegc_flush(
+ struct xfs_mount *mp)
+{
+ xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
-
- xfs_inodegc_queue_all(mp);
flush_workqueue(mp->m_inodegc_wq);
}
@@ -2014,6 +2024,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
spin_lock(&ip->i_flags_lock);
@@ -2025,19 +2036,26 @@ xfs_inodegc_queue(
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
- put_cpu_ptr(gc);
- if (!xfs_is_inodegc_enabled(mp))
+ /*
+ * We queue the work while holding the current CPU so that the work
+ * is scheduled to run on this CPU.
+ */
+ if (!xfs_is_inodegc_enabled(mp)) {
+ put_cpu_ptr(gc);
return;
-
- if (xfs_inodegc_want_queue_work(ip, items)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
}
+ if (xfs_inodegc_want_queue_work(ip, items))
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+ put_cpu_ptr(gc);
+
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
- flush_work(&gc->work);
+ flush_delayed_work(&gc->work);
}
}
@@ -2054,7 +2072,7 @@ xfs_inodegc_cpu_dead(
unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_work_sync(&dead_gc->work);
+ cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list))
return;
@@ -2073,12 +2091,12 @@ xfs_inodegc_cpu_dead(
llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count);
- put_cpu_ptr(gc);
if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
}
+ put_cpu_ptr(gc);
}
/*
@@ -2173,7 +2191,7 @@ xfs_inodegc_shrinker_scan(
unsigned int h = READ_ONCE(gc->shrinker_hits);
WRITE_ONCE(gc->shrinker_hits, h + 1);
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
no_items = false;
}
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 2e4cfddf8b8e..6cd180721659 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 52d6f2c7d58b..3e1c62ffa4f7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,6 +132,26 @@ xfs_ilock_attr_map_shared(
}
/*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
+ * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
+ * to set in lock_flags.
+ */
+static inline void
+xfs_lock_flags_assert(
+ uint lock_flags)
+{
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+}
+
+/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
* multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
@@ -168,18 +188,7 @@ xfs_ilock(
{
trace_xfs_ilock(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
@@ -222,18 +231,7 @@ xfs_ilock_nowait(
{
trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
@@ -291,19 +289,7 @@ xfs_iunlock(
xfs_inode_t *ip,
uint lock_flags)
{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- ASSERT(lock_flags != 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -379,8 +365,8 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5a393259a3a3..5d50fed291b4 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -664,7 +664,7 @@ xfs_ilock_for_iomap(
unsigned flags,
unsigned *lockmode)
{
- unsigned mode = XFS_ILOCK_SHARED;
+ unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
@@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK;
int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
@@ -1172,7 +1175,7 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 29f5b8b8aca6..a7402f6ea510 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -667,13 +667,15 @@ xfs_setattr_nonsize(
uint qflags = 0;
if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
- uid = iattr->ia_uid;
+ uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
qflags |= XFS_QMOPT_UQUOTA;
} else {
uid = inode->i_uid;
}
if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
- gid = iattr->ia_gid;
+ gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
qflags |= XFS_QMOPT_GQUOTA;
} else {
gid = inode->i_gid;
@@ -704,13 +706,13 @@ xfs_setattr_nonsize(
* didn't have the inode locked, inode's dquot(s) would have changed
* also.
*/
- if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
- !uid_eq(inode->i_uid, iattr->ia_uid)) {
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ i_uid_needs_update(mnt_userns, iattr, inode)) {
ASSERT(udqp);
old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
}
- if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
- !gid_eq(inode->i_gid, iattr->ia_gid)) {
+ if (XFS_IS_GQUOTA_ON(mp) &&
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
ASSERT(gdqp);
old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ba5d42abf66e..d2eaebd85abf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -61,7 +61,7 @@ struct xfs_error_cfg {
*/
struct xfs_inodegc {
struct llist_head list;
- struct work_struct work;
+ struct delayed_work work;
/* approximate count of inodes in the list */
unsigned int items;
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 74ac9ca9e119..392cb39cc10c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -454,9 +454,12 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp;
int error;
- /* Flush inodegc work at the start of a quota reporting scan. */
+ /*
+ * Expedite pending inodegc work at the start of a quota reporting
+ * scan but don't block waiting for it to complete.
+ */
if (id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
/*
* Try to get the dquot. We don't want it allocated on disk, so don't
@@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next(
/* Flush inodegc work at the start of a quota reporting scan. */
if (*id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ed18160e6181..aa977c7ea370 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -797,8 +797,11 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
- /* Wait for whatever inactivations are in progress. */
- xfs_inodegc_flush(mp);
+ /*
+ * Expedite background inodegc but don't wait. We do not want to block
+ * here waiting hours for a billion extent file to be truncated.
+ */
+ xfs_inodegc_push(mp);
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -1074,7 +1077,7 @@ xfs_inodegc_init_percpu(
gc = per_cpu_ptr(mp->m_inodegc, cpu);
init_llist_head(&gc->list);
gc->items = 0;
- INIT_WORK(&gc->work, xfs_inodegc_worker);
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d32026585c1b..0fa1b7a2918c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 053299758deb..08c2f0102460 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -271,7 +271,7 @@ static const struct address_space_operations zonefs_file_aops = {
.dirty_folio = filemap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.direct_IO = noop_direct_IO,
@@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
!uid_eq(iattr->ia_uid, inode->i_uid)) ||
((iattr->ia_valid & ATTR_GID) &&
!gid_eq(iattr->ia_gid, inode->i_gid))) {
- ret = dquot_transfer(inode, iattr);
+ ret = dquot_transfer(mnt_userns, inode, iattr);
if (ret)
return ret;
}