Diffstat (limited to 'fs')
329 files changed, 8863 insertions, 21669 deletions
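Most of the 9p and afs hunks below are one mechanical change: the old anonymous struct pairing vfs_inode with netfs_ctx is replaced by an embedded struct netfs_inode, whose first member is the VFS inode itself, and every vfs_inode/netfs_ctx reference becomes netfs.inode/netfs. A toy userspace model of that embedding and of the container_of() recovery it relies on (field sets abbreviated; the struct names only loosely mirror the kernel's):

#include <stddef.h>
#include <stdio.h>

struct inode { unsigned long i_ino; };

/* netfs_inode keeps the VFS inode as its FIRST member, so a pointer
 * to either structure can stand in for the other. */
struct netfs_inode {
	struct inode inode;
	void *cache;               /* fscache cookie in the kernel */
	long long remote_i_size;   /* size the server last reported */
};

struct v9fs_inode {
	struct netfs_inode netfs;  /* must be first */
	unsigned int cache_validity;
};

/* simplified container_of(); the kernel's adds type checking */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct v9fs_inode *V9FS_I(struct inode *inode)
{
	return container_of(inode, struct v9fs_inode, netfs.inode);
}

int main(void)
{
	struct v9fs_inode v9 = { .netfs.inode.i_ino = 42 };
	struct inode *inode = &v9.netfs.inode;

	/* recovering the filesystem inode from the VFS inode */
	printf("ino=%lu same=%d\n", inode->i_ino, V9FS_I(inode) == &v9);
	return 0;
}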
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 1c8dc696d516..cebba4eaa0b5 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) version = cpu_to_le32(v9inode->qid.version); path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->netfs_ctx.cache = + v9inode->netfs.cache = fscache_acquire_cookie(v9fs_session_cache(v9ses), 0, &path, sizeof(path), &version, sizeof(version), - i_size_read(&v9inode->vfs_inode)); + i_size_read(&v9inode->netfs.inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 79df61fe0e59..baf2b152229e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, const unsigned char **wnames, *uname; int i, n, l, clone, access; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *old_fid = NULL; + struct p9_fid *fid, *old_fid; v9ses = v9fs_dentry2v9ses(dentry); access = v9ses->flags & V9FS_ACCESS_MASK; @@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, if (IS_ERR(fid)) return fid; + refcount_inc(&fid->count); v9fs_fid_add(dentry->d_sb->s_root, fid); } /* If we are root ourself just return that */ - if (dentry->d_sb->s_root == dentry) { - refcount_inc(&fid->count); + if (dentry->d_sb->s_root == dentry) return fid; - } /* * Do a multipath walk with attached root. * When walking parent we need to make sure we @@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = ERR_PTR(n); goto err_out; } + old_fid = fid; clone = 1; i = 0; while (i < n) { @@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, * walk to ensure none of the patch component change */ fid = p9_client_walk(fid, l, &wnames[i], clone); + /* non-cloning walk will return the same fid */ + if (fid != old_fid) { + p9_client_clunk(old_fid); + old_fid = fid; + } if (IS_ERR(fid)) { - if (old_fid) { - /* - * If we fail, clunk fid which are mapping - * to path component and not the last component - * of the path. 
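The fs/9p/fid.c hunk around this point restructures the component walk so that the previous fid is clunked as soon as p9_client_walk() hands back a different one, which lets the error path drop its special-case cleanup. A toy userspace model of the new loop shape (walk()/clunk() are stand-ins, not the p9_client_* API, and the kernel walks several components per call):

#include <stdio.h>
#include <stdlib.h>

struct fid { int id; };

/* toy walk: cloning hands back a fresh fid, non-cloning the same one;
 * NULL plays the role of an IS_ERR() pointer */
static struct fid *walk(struct fid *from, const char *name, int clone)
{
	struct fid *f = from;

	if (clone) {
		f = malloc(sizeof(*f));
		if (f)
			f->id = from->id + 1;
	}
	return f;
}

static void clunk(struct fid *f)
{
	printf("clunk fid %d\n", f->id);
	free(f);
}

static struct fid *lookup(struct fid *fid, const char **names, int n)
{
	struct fid *old_fid = fid;
	int clone = 1;

	for (int i = 0; i < n; i++) {
		fid = walk(fid, names[i], clone);
		/* a non-cloning walk returns the same fid */
		if (fid != old_fid) {
			clunk(old_fid);
			old_fid = fid;
		}
		if (!fid)          /* error: everything already clunked */
			return NULL;
		clone = 0;
	}
	return fid;
}

int main(void)
{
	struct fid *root = malloc(sizeof(*root));
	const char *names[] = { "usr", "share", "doc" };
	struct fid *f;

	root->id = 0;
	f = lookup(root, names, 3);
	if (f)
		clunk(f);
	return 0;
}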
- */ - p9_client_clunk(old_fid); - } kfree(wnames); goto err_out; } - old_fid = fid; i += l; clone = 0; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e28ddf763b3b..0129de2ea31a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo) struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; memset(&v9inode->qid, 0, sizeof(v9inode->qid)); - inode_init_once(&v9inode->vfs_inode); + inode_init_once(&v9inode->netfs.inode); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index ec0e8df3b2eb..6acabc2e7dc9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,11 +109,7 @@ struct v9fs_session_info { #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; @@ -122,13 +118,13 @@ struct v9fs_inode { static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { - return container_of(inode, struct v9fs_inode, vfs_inode); + return container_of(inode, struct v9fs_inode, netfs.inode); } static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return netfs_i_cookie(&v9inode->vfs_inode); + return netfs_i_cookie(&v9inode->netfs); #else return NULL; #endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8ce82ff1e40a..d0833fa69faf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -58,21 +58,33 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct inode *inode = file_inode(file); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; + BUG_ON(!fid); + + /* we might need to read from a fid that was opened write-only + * for read-modify-write of page cache, use the writeback fid + * for that */ + if (rreq->origin == NETFS_READ_FOR_WRITE && + (fid->mode & O_ACCMODE) == O_WRONLY) { + fid = v9inode->writeback_fid; + BUG_ON(!fid); + } + refcount_inc(&fid->count); rreq->netfs_priv = fid; return 0; } /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request - * @mapping: unused mapping of request to cleanup - * @priv: private data to cleanup, a fid, guaranted non-null. 
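Also above, v9fs_init_request() gains a fallback: a page-cache read issued for read-modify-write (NETFS_READ_FOR_WRITE) cannot be serviced by a fid that was opened write-only, so the inode's writeback fid is substituted. The predicate, modeled in plain C:

#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

/* read_for_write corresponds to rreq->origin == NETFS_READ_FOR_WRITE */
static bool needs_writeback_fid(int open_flags, bool read_for_write)
{
	return read_for_write && (open_flags & O_ACCMODE) == O_WRONLY;
}

int main(void)
{
	printf("%d %d\n",
	       needs_writeback_fid(O_WRONLY, true),  /* 1: substitute */
	       needs_writeback_fid(O_RDWR, true));   /* 0: fid can read */
	return 0;
}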
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq + * @rreq: The I/O request to clean up */ -static void v9fs_req_cleanup(struct address_space *mapping, void *priv) +static void v9fs_free_request(struct netfs_io_request *rreq) { - struct p9_fid *fid = priv; + struct p9_fid *fid = rreq->netfs_priv; p9_client_clunk(fid); } @@ -94,9 +106,9 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, + .free_request = v9fs_free_request, .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, - .cleanup = v9fs_req_cleanup, }; /** @@ -140,7 +152,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, transferred_or_error != -ENOBUFS) { version = cpu_to_le32(v9inode->qid.version); fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->vfs_inode), 0); + i_size_read(&v9inode->netfs.inode), 0); } } @@ -274,7 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata); + retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); if (retval < 0) return retval; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 55367ecb9442..3d8297714772 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); - return &v9inode->vfs_inode; + return &v9inode->netfs.inode; } /** @@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode) */ static void v9fs_set_netfs_context(struct inode *inode) { - netfs_i_context_init(inode, &v9fs_req_ops); + struct v9fs_inode *v9inode = V9FS_I(inode); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); } int v9fs_init_inode(struct v9fs_session_info *v9ses, @@ -1250,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); + if (!v9fs_proto_dotu(v9ses)) + return ERR_PTR(-EBADF); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); - if (!v9fs_proto_dotu(v9ses)) - return ERR_PTR(-EBADF); - st = p9_client_stat(fid); p9_client_clunk(fid); if (IS_ERR(st)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index d17502a738a9..b6eb1160296c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_client_clunk(dfid); goto out; } @@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); + p9_client_clunk(dfid); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), @@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); + p9_client_clunk(dfid); goto error; } v9fs_invalidate_inode_attr(dir); diff --git a/fs/Makefile b/fs/Makefile index 
208a74e0b00e..93b80529f8e8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_IO_URING) += io_uring.o -obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/fs/affs/file.c b/fs/affs/file.c index cd00a4c68a12..cefa222f7881 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -526,7 +526,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; - char *data; unsigned pos = 0; u32 bidx, boff, bsize; u32 tmp; @@ -545,15 +544,12 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); - data = kmap_atomic(page); - memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); - kunmap_atomic(data); + memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; pos += tmp; boff = 0; } - flush_dcache_page(page); return 0; } diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1b4d5809808d..a484fa642808 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); - unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false); + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } void afs_server_init_callback_work(struct work_struct *work) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 79f6b74336d2..56ae5cd5184f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -109,7 +109,7 @@ struct afs_lookup_cookie { */ static void afs_dir_read_cleanup(struct afs_read *req) { - struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct address_space *mapping = req->vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; @@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, block = kmap_local_folio(folio, offset); if (block->hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->vfs_inode.i_ino, + __func__, dvnode->netfs.inode.i_ino, pos, offset, size, ntohs(block->hdr.magic)); trace_afs_dir_check_failed(dvnode, pos + offset, i_size); kunmap_local(block); @@ -183,7 +183,7 @@ error: static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; size_t offset, size; @@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) */ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; int ret = 0; @@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file) static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) __acquires(&dvnode->validate_lock) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = 
dvnode->netfs.inode.i_mapping; struct afs_read *req; loff_t i_size; int nr_pages, i; @@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) req->cleanup = afs_dir_read_cleanup; expand: - i_size = i_size_read(&dvnode->vfs_inode); + i_size = i_size_read(&dvnode->netfs.inode); if (i_size < 2048) { ret = afs_bad(dvnode, afs_file_error_dir_small); goto error; @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; @@ -897,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, out_op: if (op->error == 0) { - inode = &op->file[1].vnode->vfs_inode; + inode = &op->file[1].vnode->netfs.inode; op->file[1].vnode = NULL; } @@ -1139,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1170,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - vnode->vfs_inode.i_generation); + vnode->netfs.inode.i_generation); goto not_found; } goto out_valid; @@ -1368,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -1487,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op) /* Already done */ } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { write_seqlock(&vnode->cb_lock); - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_unlink); } @@ -1504,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op) op->error = ret; } - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); } static void afs_unlink_success(struct afs_operation *op) @@ -1680,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op) afs_update_dentry_version(op, dvp, op->dentry); if (op->dentry_2->d_parent == op->dentry->d_parent) afs_update_dentry_version(op, dvp, op->dentry_2); - ihold(&vp->vnode->vfs_inode); - d_instantiate(op->dentry, &vp->vnode->vfs_inode); + ihold(&vp->vnode->netfs.inode); + d_instantiate(op->dentry, &vp->vnode->netfs.inode); } static void afs_link_put(struct afs_operation *op) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index d98e109ecee9..0ab7752d1b75 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, */ static struct folio *afs_dir_get_folio(struct 
afs_vnode *vnode, pgoff_t index) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; folio = __filemap_get_folio(mapping, index, @@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -336,7 +336,7 @@ found_space: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] -= need_slots; - inode_inc_iversion_raw(&vnode->vfs_inode); + inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); @@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { @@ -463,7 +463,7 @@ found_dirent: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] += need_slots; - inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); _debug("Remove %s from %u[%u]", name->name, b, slot); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 45cfd50a9521..bb5807e87fa4 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, goto out; } while (!d_is_negative(sdentry)); - ihold(&vnode->vfs_inode); + ihold(&vnode->netfs.inode); ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key); switch (ret) { @@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, d_drop(sdentry); } - iput(&vnode->vfs_inode); + iput(&vnode->netfs.inode); dput(sdentry); out: _leave(" = %d", ret); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f120bcb8bf73..d7d9402ff718 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_i_context_init(inode, NULL); + netfs_inode_init(&vnode->netfs, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index a8e8832179e4..d1cfb235c4b9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file) afs_put_wb_key(af->wb); if ((file->f_mode & FMODE_WRITE)) { - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); afs_set_cache_aux(vnode, &aux); fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); } else { @@ -325,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->iter = &fsreq->def_iter; iov_iter_xarray(&fsreq->def_iter, READ, - &fsreq->vnode->vfs_inode.i_mapping->i_pages, + &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); afs_fetch_data(fsreq->vnode, fsreq); @@ -375,24 +375,24 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq) } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void 
**_fsdata) + struct folio **foliop, void **_fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; } -static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +static void afs_free_request(struct netfs_io_request *rreq) { - key_put(netfs_priv); + key_put(rreq->netfs_priv); } const struct netfs_request_ops afs_req_ops = { .init_request = afs_init_request, + .free_request = afs_free_request, .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, .issue_read = afs_issue_read, - .cleanup = afs_priv_cleanup, }; int afs_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index d222dfbe976b..7a3803ce3a22 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op) if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) - iput(&op->file[0].vnode->vfs_inode); + iput(&op->file[0].vnode->netfs.inode); if (op->file[1].put_vnode) - iput(&op->file[1].vnode->vfs_inode); + iput(&op->file[1].vnode->netfs.inode); if (op->more_files) { for (i = 0; i < op->nr_files - 2; i++) if (op->more_files[i].put_vnode) - iput(&op->more_files[i].vnode->vfs_inode); + iput(&op->more_files[i].vnode->netfs.inode); kfree(op->more_files); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 30b066299d39..64dab70d4a4f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops); } /* @@ -96,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); - set_nlink(&vnode->vfs_inode, status->nlink); + set_nlink(&vnode->netfs.inode, status->nlink); switch (status->type) { case AFS_FTYPE_FILE: @@ -139,7 +139,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; - inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver @@ -163,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->vfs_inode; + struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; bool data_changed = false; @@ -246,7 +246,7 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. 
*/ - vnode->netfs_ctx.remote_i_size = status->size; + vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); inode->i_ctime = t; @@ -289,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v */ if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } @@ -306,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v if (vp->scb.have_cb) afs_apply_callback(op, vp); } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_deleted); } @@ -326,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->vfs_inode.i_state & I_NEW) { + if (vnode->netfs.inode.i_state & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); op->error = ret; if (ret == 0) @@ -430,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE) { - vnode->netfs_ctx.cache = NULL; + vnode->netfs.cache = NULL; return; } @@ -457,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) { struct afs_vnode_param *dvp = &op->file[0]; - struct super_block *sb = dvp->vnode->vfs_inode.i_sb; + struct super_block *sb = dvp->vnode->netfs.inode.i_sb; struct afs_vnode *vnode; struct inode *inode; int ret; @@ -582,10 +582,10 @@ static void afs_zap_data(struct afs_vnode *vnode) /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a * directory or symlink */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - invalidate_remote_inode(&vnode->vfs_inode); + if (S_ISREG(vnode->netfs.inode.i_mode)) + invalidate_remote_inode(&vnode->netfs.inode); else - invalidate_inode_pages2(vnode->vfs_inode.i_mapping); + invalidate_inode_pages2(vnode->netfs.inode.i_mapping); } /* @@ -683,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) key_serial(key)); if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->vfs_inode.i_nlink) - clear_nlink(&vnode->vfs_inode); + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); goto valid; } @@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - if (!(query_flags & AT_STATX_DONT_SYNC) && + if (vnode->volume && + !(query_flags & AT_STATX_DONT_SYNC) && !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) @@ -826,7 +827,7 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; loff_t old_i_size = i_size_read(inode); op->setattr.old_i_size = old_i_size; @@ -843,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op) static void 
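The afs_apply_status() hunk here keeps two sizes: the netfs context's remote_i_size always tracks what the server reported, while the local i_size only follows it when the event implies a real size change, since dirty page-cache writes may have pushed the local size ahead of the server. A toy model:

#include <stdio.h>

struct sizes {
	long long i_size;          /* local size, may run ahead */
	long long remote_i_size;   /* server's last reported size */
};

static void apply_status(struct sizes *s, long long server_size,
			 int change_size)
{
	s->remote_i_size = server_size;
	if (change_size)
		s->i_size = server_size;
}

int main(void)
{
	struct sizes s = { .i_size = 8192, .remote_i_size = 4096 };

	apply_status(&s, 4096, 0);   /* local writes not yet stored */
	printf("local=%lld remote=%lld\n", s.i_size, s.remote_i_size);
	return 0;
}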
afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; @@ -875,7 +876,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - struct inode *inode = &vnode->vfs_inode; + struct inode *inode = &vnode->netfs.inode; loff_t i_size; int ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a30995901266..a6f25d9e75b5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -619,12 +619,7 @@ enum afs_lock_state { * leak from one inode to another. */ struct afs_vnode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; - + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ @@ -675,7 +670,7 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return netfs_i_cookie(&vnode->vfs_inode); + return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif @@ -685,7 +680,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, struct fscache_cookie *cookie) { #ifdef CONFIG_AFS_FSCACHE - vnode->netfs_ctx.cache = cookie; + vnode->netfs.cache = cookie; #endif } @@ -892,7 +887,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl afs_set_cache_aux(vnode, &aux); fscache_invalidate(afs_vnode_cache(vnode), &aux, - i_size_read(&vnode->vfs_inode), flags); + i_size_read(&vnode->netfs.inode), flags); } /* @@ -1217,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode) static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { - return afs_i2net(&vnode->vfs_inode); + return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) @@ -1593,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *); */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { - return container_of(inode, struct afs_vnode, vfs_inode); + return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { - return &vnode->vfs_inode; + return &vnode->netfs.inode; } /* @@ -1621,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op, */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; + i_size_write(&vnode->netfs.inode, size); + vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index bbb2c210d139..97f50e9fd9eb 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -132,12 +132,6 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (IS_ERR(page)) return PTR_ERR(page); - if (PageError(page)) { - ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); - put_page(page); - return ret; - } - buf = kmap(page); ret = -EINVAL; if (buf[size - 1] == '.') 
diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fea195b0b27..95d713074dc8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode) struct afs_vnode *vnode = _vnode; memset(vnode, 0, sizeof(*vnode)); - inode_init_once(&vnode->vfs_inode); + inode_init_once(&vnode->netfs.inode); mutex_init(&vnode->io_lock); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); @@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) init_rwsem(&vnode->rmdir_lock); INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); - _leave(" = %p", &vnode->vfs_inode); - return &vnode->vfs_inode; + _leave(" = %p", &vnode->netfs.inode); + return &vnode->netfs.inode; } static void afs_free_inode(struct inode *inode) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 94a3d247924b..cc665cef0abe 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -9,8 +9,7 @@ #include <linux/slab.h> #include "internal.h" -unsigned __read_mostly afs_volume_gc_delay = 10; -unsigned __read_mostly afs_volume_record_life = 60 * 60; +static unsigned __read_mostly afs_volume_record_life = 60 * 60; /* * Insert a volume into a cell. If there's an existing volume record, that is diff --git a/fs/afs/write.c b/fs/afs/write.c index 2236b2165e37..2c885b22de34 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata); + ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); if (ret < 0) return ret; @@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_end_pos = pos + copied; - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (write_end_pos > i_size) { write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (write_end_pos > i_size) afs_set_i_size(vnode, write_end_pos); write_sequnlock(&vnode->cb_lock); @@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t end; @@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = { static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, bool laundering) { - struct netfs_i_context *ictx = &vnode->netfs_ctx; struct afs_operation *op; struct afs_wb_key *wbk = NULL; loff_t size = iov_iter_count(iter); @@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, ictx->remote_i_size); + op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); op->store.laundering = laundering; - op->mtime = vnode->vfs_inode.i_mtime; + op->mtime = vnode->netfs.inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, struct iov_iter iter; unsigned long priv; unsigned int offset, to, len, 
max_len; - loff_t i_size = i_size_read(&vnode->vfs_inode); + loff_t i_size = i_size_read(&vnode->netfs.inode); bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; @@ -845,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) _enter("{%llx:%llu},{%zu},", vnode->fid.vid, vnode->fid.vnode, count); - if (IS_SWAPFILE(&vnode->vfs_inode)) { + if (IS_SWAPFILE(&vnode->netfs.inode)) { printk(KERN_INFO "AFS: Attempt to write to active swap file!\n"); return -EBUSY; @@ -958,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* Discard unused keys */ spin_lock(&vnode->wb_lock); - if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && - !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { + if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) { list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { if (refcount_read(&wbk->usage) == 1) list_move(&wbk->vnode_link, &graveyard); @@ -1034,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode, bool caching) { fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->vfs_inode.i_mapping, start, len, i_size, + vnode->netfs.inode.i_mapping, start, len, i_size, afs_write_to_cache_done, vnode, caching); } @@ -400,8 +400,8 @@ static const struct file_operations aio_ring_fops = { }; #if IS_ENABLED(CONFIG_MIGRATION) -static int aio_migratepage(struct address_space *mapping, struct page *new, - struct page *old, enum migrate_mode mode) +static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; @@ -435,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, goto out; } - idx = old->index; + idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { - /* Make sure the old page hasn't already been changed */ - if (ctx->ring_pages[idx] != old) + /* Make sure the old folio hasn't already been changed */ + if (ctx->ring_pages[idx] != &src->page) rc = -EAGAIN; } else rc = -EINVAL; @@ -447,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, goto out_unlock; /* Writeback must be complete */ - BUG_ON(PageWriteback(old)); - get_page(new); + BUG_ON(folio_test_writeback(src)); + folio_get(dst); - rc = migrate_page_move_mapping(mapping, new, old, 1); + rc = folio_migrate_mapping(mapping, dst, src, 1); if (rc != MIGRATEPAGE_SUCCESS) { - put_page(new); + folio_put(dst); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer - * while the old page is copied to the new. This prevents new + * while the old folio is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - BUG_ON(ctx->ring_pages[idx] != old); - ctx->ring_pages[idx] = new; + folio_migrate_copy(dst, src); + BUG_ON(ctx->ring_pages[idx] != &src->page); + ctx->ring_pages[idx] = &dst->page; spin_unlock_irqrestore(&ctx->completion_lock, flags); - /* The old page is no longer accessible. */ - put_page(old); + /* The old folio is no longer accessible. 
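The aio hunk above rewrites the ring-buffer migration hook in folio terms (folio_get(), folio_test_writeback(), folio_migrate_mapping(), folio_migrate_copy(), folio_put()), and just below it defines aio_migrate_folio to NULL when CONFIG_MIGRATION is off, so the aio_ctx_aops initializer no longer needs an #ifdef of its own. A compilable model of that NULL-method pattern:

#include <stdio.h>

struct aops { int (*migrate)(int src, int dst); };

#ifdef HAVE_MIGRATION
static int toy_migrate(int src, int dst) { return dst - src; }
#else
#define toy_migrate NULL
#endif

/* no #ifdef needed around the initializer itself */
static const struct aops toy_aops = {
	.migrate = toy_migrate,
};

int main(void)
{
	printf("migrate is %sset\n", toy_aops.migrate ? "" : "not ");
	return 0;
}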
*/ + folio_put(src); out_unlock: mutex_unlock(&ctx->ring_lock); @@ -475,13 +475,13 @@ out: spin_unlock(&mapping->private_lock); return rc; } +#else +#define aio_migrate_folio NULL #endif static const struct address_space_operations aio_ctx_aops = { .dirty_folio = noop_dirty_folio, -#if IS_ENABLED(CONFIG_MIGRATION) - .migratepage = aio_migratepage, -#endif + .migrate_folio = aio_migrate_folio, }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) @@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = iocb_flags(req->ki_filp); + req->ki_flags = req->ki_filp->f_iocb_flags; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/fs/attr.c b/fs/attr.c index 66899b6e9bd8..b5b8835ddf15 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -22,7 +22,7 @@ * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @uid: uid to chown @inode to + * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -31,15 +31,15 @@ * performed on the raw inode simply passs init_user_ns. */ static bool chown_ok(struct user_namespace *mnt_userns, - const struct inode *inode, - kuid_t uid) + const struct inode *inode, vfsuid_t ia_vfsuid) { - kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && + vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(kuid, INVALID_UID) && + if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, * chgrp_ok - verify permissions to chgrp inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @gid: gid to chown @inode to + * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -58,15 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns, * performed on the raw inode simply passs init_user_ns. 
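The fs/attr.c hunks around this point convert the ownership checks from raw kuid_t/kgid_t to the vfsuid_t/vfsgid_t wrappers, so an id as seen through an idmapped mount can no longer be mixed up with a filesystem id by accident; the comparison's meaning has to be named via helpers such as vfsuid_eq() and vfsuid_eq_kuid(). A toy model of that type-safety idea (the helpers here only mimic the kernel's mnt_idmapping ones):

#include <stdbool.h>
#include <stdio.h>

typedef struct { unsigned int val; } kuid_t;    /* filesystem id */
typedef struct { unsigned int val; } vfsuid_t;  /* id seen through the mount */

static bool vfsuid_eq(vfsuid_t a, vfsuid_t b)    { return a.val == b.val; }
static bool vfsuid_eq_kuid(vfsuid_t a, kuid_t b) { return a.val == b.val; }
static bool vfsuid_valid(vfsuid_t a)  { return a.val != (unsigned int)-1; }

int main(void)
{
	vfsuid_t mapped = { 1000 };
	kuid_t fsuid = { 1000 };

	/* passing "mapped" where a kuid_t is expected is now a type
	 * error; the intended comparison must be spelled out: */
	printf("%d %d\n", vfsuid_eq_kuid(mapped, fsuid), vfsuid_valid(mapped));
	return 0;
}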
*/ static bool chgrp_ok(struct user_namespace *mnt_userns, - const struct inode *inode, kgid_t gid) + const struct inode *inode, vfsgid_t ia_vfsgid) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && - (in_group_p(gid) || gid_eq(gid, inode->i_gid))) - return true; + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { + if (vfsgid_eq(ia_vfsgid, vfsgid)) + return true; + if (vfsgid_in_group_p(ia_vfsgid)) + return true; + } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(kgid, INVALID_GID) && + if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -114,21 +118,30 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && + !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && + !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { + vfsgid_t vfsgid; + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; + + if (ia_valid & ATTR_GID) + vfsgid = attr->ia_vfsgid; + else + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - i_gid_into_mnt(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -205,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok); * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr on idmapped mounts. If file ownership is changed setattr_copy - * doesn't map ia_uid and ia_gid. It will asssume the caller has already - * provided the intended values. Necessary permission checks to determine + * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex @@ -228,10 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -240,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(kgid) && + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; @@ -292,9 +301,6 @@ EXPORT_SYMBOL(may_setattr); * retry. 
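One subtlety in the setattr_prepare() hunk above: when a chmod is combined with a chgrp, the setgid-bit check must use the gid being set, not the inode's current gid. Toy model (in_group() stands in for in_group_p(), the flag for the CAP_FSETID capability check):

#include <stdbool.h>
#include <stdio.h>

#define MODE_SGID 02000u

static bool in_group(unsigned int gid) { return gid == 100; }

static unsigned int chmod_mode(unsigned int mode, bool changing_gid,
			       unsigned int new_gid, unsigned int cur_gid,
			       bool cap_fsetid)
{
	/* check membership against the gid the file will end up with */
	unsigned int gid = changing_gid ? new_gid : cur_gid;

	if (!in_group(gid) && !cap_fsetid)
		mode &= ~MODE_SGID;
	return mode;
}

int main(void)
{
	/* chgrp to a non-member gid without CAP_FSETID drops setgid */
	printf("%o\n", chmod_mode(02755, true, 999, 100, false));
	return 0;
}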
Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * - * If file ownership is changed notify_change() doesn't map ia_uid and - * ia_gid. It will asssume the caller has already provided the intended values. - * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding @@ -383,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && - !uid_valid(i_uid_into_mnt(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; - error = security_inode_setattr(dentry, attr); + error = security_inode_setattr(mnt_userns, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index be383fa46b12..32749fcee090 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -108,8 +108,7 @@ static const struct export_operations befs_export_operations = { * passes it the address of befs_get_block, for mapping file * positions to disk blocks. 
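Further below, befs_symlink_read_folio() is converted to work on the folio directly (folio->mapping, folio_address(), folio_mark_uptodate(), folio_unlock()) while keeping its length handling intact. That handling, modeled in plain C:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* reject empty or over-long targets, then NUL-terminate in place */
static bool copy_symlink(char *link, const char *data, size_t len)
{
	if (len == 0 || len > PAGE_SIZE)
		return false;   /* "Long symlink with illegal length" */
	memcpy(link, data, len);
	link[len - 1] = '\0';
	return true;
}

int main(void)
{
	char buf[PAGE_SIZE];

	printf("%d\n", copy_symlink(buf, "target", 7));
	return 0;
}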
*/ -static int -befs_read_folio(struct file *file, struct folio *folio) +static int befs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, befs_get_block); } @@ -470,13 +469,12 @@ befs_destroy_inodecache(void) */ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link = page_address(page); + char *link = folio_address(folio); if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); @@ -489,12 +487,12 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) goto fail; } link[len - 1] = '\0'; - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_set_error(folio); + folio_unlock(folio); return -EIO; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 07960529b360..6e2596ddae10 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -13,7 +13,6 @@ struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); -typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ebc392ea1d74..d385357e19b6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2028,10 +2028,29 @@ out: return ret; } +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + void *ctx, bool ignore_offset) { int ret; u64 extent_item_pos; @@ -2049,17 +2068,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx, ignore_offset); + build_ino_list, ctx, ignore_offset); return ret; } -typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx); +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, struct inode_fs_paths *ipath); -static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) { int ret = 0; int slot; @@ -2068,6 +2085,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u32 name_len; u64 parent = 0; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct 
btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2103,8 +2122,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, fs_root->root_key.objectid); - ret = iterate(parent, name_len, - (unsigned long)(iref + 1), eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)(iref + 1), eb, ipath); if (ret) break; len = sizeof(*iref) + name_len; @@ -2118,15 +2137,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) { int ret; int slot; u64 offset = 0; u64 parent; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_extref *extref; u32 item_size; @@ -2162,8 +2181,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, extref = (struct btrfs_inode_extref *)(ptr + cur_offset); parent = btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); - ret = iterate(parent, name_len, - (unsigned long)&extref->name, eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)&extref->name, eb, ipath); if (ret) break; @@ -2180,34 +2199,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, iterate_irefs_t *iterate, - void *ctx) -{ - int ret; - int found_refs = 0; - - ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); - if (!ret) - ++found_refs; - else if (ret != -ENOENT) - return ret; - - ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); - if (ret == -ENOENT && found_refs) - return 0; - - return ret; -} - /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx) + struct extent_buffer *eb, struct inode_fs_paths *ipath) { - struct inode_fs_paths *ipath = ctx; char *fspath; char *fspath_min; int i = ipath->fspath->elem_cnt; @@ -2248,8 +2246,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, */ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { - return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, ipath); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, ipath); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; } struct btrfs_data_container *init_data_container(u32 total_bytes) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ba454032dbe2..2759de7d324c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -35,8 +35,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, + struct btrfs_path *path, void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ede389f2602d..c3aecfb0a71d 100644 --- 
a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1051,8 +1051,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); + WARN_ON(block_group->zone_is_active && + block_group->space_info->active_total_bytes + < block_group->length); } block_group->space_info->total_bytes -= block_group->length; + if (block_group->zone_is_active) + block_group->space_info->active_total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); block_group->space_info->bytes_zone_unusable -= @@ -1816,11 +1821,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { stripe_nr = stripe_nr * map->num_stripes + i; stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; } /* * The remaining case would be for RAID56, multiply by @@ -2108,7 +2112,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, cache->used, cache->bytes_super, - cache->zone_unusable, &space_info); + cache->zone_unusable, cache->zone_is_active, + &space_info); cache->space_info = space_info; @@ -2178,7 +2183,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, 0, &space_info); + 0, 0, false, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -2559,7 +2564,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, cache->zone_unusable, - &cache->space_info); + cache->zone_is_active, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2659,6 +2664,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; + /* + * We have allocated a new chunk. We also need to activate that chunk to + * grant metadata tickets for zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); if (ret == -ETXTBSY) goto unlock_out; @@ -3761,6 +3774,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, * attempt. */ wait_for_alloc = true; + force = CHUNK_ALLOC_NO_FORCE; spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); @@ -3884,6 +3898,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, ret = PTR_ERR(bg); } else { /* + * We have a new chunk. We also need to activate it for + * zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + if (ret < 0) + return; + + /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). 
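In the btrfs_rmap_block() hunk above, the separate RAID0 branch could be folded into the RAID10 one because btrfs sets sub_stripes to 1 for RAID0, making the extra division a no-op. A minimal model of the merged stripe math:

#include <stdint.h>
#include <stdio.h>

static uint64_t logical_stripe(uint64_t stripe_nr, uint64_t num_stripes,
			       uint64_t i, uint64_t sub_stripes)
{
	/* RAID0: sub_stripes == 1 (division is a no-op); RAID10: 2 */
	return (stripe_nr * num_stripes + i) / sub_stripes;
}

int main(void)
{
	printf("raid0=%llu raid10=%llu\n",
	       (unsigned long long)logical_stripe(5, 4, 2, 1),
	       (unsigned long long)logical_stripe(5, 4, 2, 2));
	return 0;
}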
So ignore diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ac668ace50a..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -104,6 +104,7 @@ struct btrfs_block_group { unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; unsigned int zone_is_active:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b3ee49b0b1e8..06be0644dd37 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; + block_rsv->full = true; } else { num_bytes = 0; } @@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, bytes_to_add = min(num_bytes, bytes_to_add); dest->reserved += bytes_to_add; if (dest->reserved >= dest->size) - dest->full = 1; + dest->full = true; num_bytes -= bytes_to_add; } spin_unlock(&dest->lock); @@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); @@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type) + enum btrfs_rsv_type type) { btrfs_init_block_rsv(rsv, type); rsv->space_info = btrfs_find_space_info(fs_info, @@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) + enum btrfs_rsv_type type) { struct btrfs_block_rsv *block_rsv; @@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) if (block_rsv->reserved >= num_bytes) { block_rsv->reserved -= num_bytes; if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; + block_rsv->full = false; ret = 0; } spin_unlock(&block_rsv->lock); @@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, if (update_size) block_rsv->size += num_bytes; else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; + block_rsv->full = true; spin_unlock(&block_rsv->lock); } @@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, } global_rsv->reserved -= num_bytes; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); btrfs_block_rsv_add_bytes(dest, num_bytes, true); @@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) btrfs_try_granting_tickets(fs_info, sinfo); } - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; + block_rsv->full = (block_rsv->reserved == block_rsv->size); if (block_rsv->size >= sinfo->total_bytes) sinfo->force_alloc = CHUNK_ALLOC_FORCE; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 3b67ff08d434..0c183709be00 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum; /* * Types of block reserves */ -enum { +enum btrfs_rsv_type { BTRFS_BLOCK_RSV_GLOBAL, 
BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, @@ -25,9 +25,10 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; + bool full; + bool failfast; + /* Block reserve type, one of BTRFS_BLOCK_RSV_* */ + enum btrfs_rsv_type type:8; /* * Qgroup equivalent for @size @reserved @@ -49,13 +50,13 @@ struct btrfs_block_rsv { u64 qgroup_rsv_reserved; }; -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type); void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 33811e896623..b160b8e124e0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,19 +279,31 @@ static inline void btrfs_insert_inode_hash(struct inode *inode) __insert_inode_hash(inode, h); } +#if BITS_PER_LONG == 32 + +/* + * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so + * we use the inode's location objectid which is a u64 to avoid truncation. + */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; - /* - * !ino: btree_inode - * type == BTRFS_ROOT_ITEM_KEY: subvol dir - */ - if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) + /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ + if (inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino; return ino; } +#else + +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + return inode->vfs_inode.i_ino; +} + +#endif + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); @@ -305,8 +317,7 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) if (root == root->fs_info->tree_root && btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) return true; - if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; + return false; } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5d20137b7b67..98c6e5feab19 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -152,7 +152,7 @@ struct btrfsic_block { struct btrfsic_block *next_in_same_bio; void *orig_bio_private; bio_end_io_t *orig_bio_end_io; - int submit_bio_bh_rw; + blk_opf_t submit_bio_bh_rw; u64 flush_gen; /* only valid if !never_written */ }; @@ -1681,7 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - int submit_bio_bh_rw) + blk_opf_t submit_bio_bh_rw) { int is_metadata; struct btrfsic_block *block; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f4564f32f6d9..e84d22c5c6a8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -136,109 +136,14 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static inline int compressed_bio_size(struct 
btrfs_fs_info *fs_info, - unsigned long disk_size) -{ - return sizeof(struct compressed_bio) + - (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size; -} - -static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, - u64 disk_start) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - const u32 csum_size = fs_info->csum_size; - const u32 sectorsize = fs_info->sectorsize; - struct page *page; - unsigned int i; - char *kaddr; - u8 csum[BTRFS_CSUM_SIZE]; - struct compressed_bio *cb = bio->bi_private; - u8 *cb_sum = cb->sums; - - if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) - return 0; - - shash->tfm = fs_info->csum_shash; - - for (i = 0; i < cb->nr_pages; i++) { - u32 pg_offset; - u32 bytes_left = PAGE_SIZE; - page = cb->compressed_pages[i]; - - /* Determine the remaining bytes inside the page first */ - if (i == cb->nr_pages - 1) - bytes_left = cb->compressed_len - i * PAGE_SIZE; - - /* Hash through the page sector by sector */ - for (pg_offset = 0; pg_offset < bytes_left; - pg_offset += sectorsize) { - kaddr = kmap_atomic(page); - crypto_shash_digest(shash, kaddr + pg_offset, - sectorsize, csum); - kunmap_atomic(kaddr); - - if (memcmp(&csum, cb_sum, csum_size) != 0) { - btrfs_print_data_csum_error(inode, disk_start, - csum, cb_sum, cb->mirror_num); - if (btrfs_bio(bio)->device) - btrfs_dev_stat_inc_and_print( - btrfs_bio(bio)->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - return -EIO; - } - cb_sum += csum_size; - disk_start += sectorsize; - } - } - return 0; -} - -/* - * Reduce bio and io accounting for a compressed_bio with its corresponding bio. - * - * Return true if there is no pending bio nor io. - * Return false otherwise. - */ -static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - unsigned int bi_size = 0; - bool last_io = false; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * At endio time, bi_iter.bi_size doesn't represent the real bio size. - * Thus here we have to iterate through all segments to grab correct - * bio size. - */ - bio_for_each_segment_all(bvec, bio, iter_all) - bi_size += bvec->bv_len; - - if (bio->bi_status) - cb->status = bio->bi_status; - - ASSERT(bi_size && bi_size <= cb->compressed_len); - last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, - &cb->pending_sectors); - /* - * Here we must wake up the possible error handler after all other - * operations on @cb finished, or we can race with - * finish_compressed_bio_*() which may free @cb. 
- */ - wake_up_var(cb); - - return last_io; -} - static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; + if (cb->status == BLK_STS_OK) + cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); + /* Release the compressed pages */ for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; @@ -247,85 +152,63 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) } /* Do io completion on the original bio */ - if (cb->status != BLK_STS_OK) { + if (cb->status != BLK_STS_OK) cb->orig_bio->bi_status = cb->status; - bio_endio(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * We have verified the checksum already, set page checked so - * the end_io handlers know about it - */ - ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { - u64 bvec_start = page_offset(bvec->bv_page) + - bvec->bv_offset; - - btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), - bvec->bv_page, bvec_start, - bvec->bv_len); - } - - bio_endio(cb->orig_bio); - } + bio_endio(cb->orig_bio); /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); } -/* when we finish reading compressed pages from the disk, we - * decompress them and then run the bio end_io routines on the - * decompressed pages (in the inode address space). - * - * This allows the checksumming and other IO error handling routines - * to work normally - * - * The compressed pages are freed here, and it must be run - * in process context +/* + * Verify the checksums and kick off repair if needed on the uncompressed data + * before decompressing it into the original bio and freeing the uncompressed + * pages. */ static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - unsigned int mirror = btrfs_bio(bio)->mirror_num; - int ret = 0; - - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; - - /* - * Record the correct mirror_num in cb->orig_bio so that - * read-repair can work properly. - */ - btrfs_bio(cb->orig_bio)->mirror_num = mirror; - cb->mirror_num = mirror; - - /* - * Some IO in this cb have failed, just skip checksum as there - * is no way it could be correct. 
- */ - if (cb->status != BLK_STS_OK) - goto csum_failed; - - inode = cb->inode; - ret = check_compressed_csum(BTRFS_I(inode), bio, - bio->bi_iter.bi_sector << 9); - if (ret) - goto csum_failed; + struct inode *inode = cb->inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *bi = BTRFS_I(inode); + bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + blk_status_t status = bio->bi_status; + struct btrfs_bio *bbio = btrfs_bio(bio); + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (!status && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, + bv.bv_page, bv.bv_offset))) { + clean_io_failure(fs_info, &bi->io_failure_tree, + &bi->io_tree, start, bv.bv_page, + btrfs_ino(bi), bv.bv_offset); + } else { + int ret; + + refcount_inc(&cb->pending_ios); + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + btrfs_submit_data_read_bio); + if (ret) { + refcount_dec(&cb->pending_ios); + status = errno_to_blk_status(ret); + } + } + } - /* ok, we're the last bio for this extent, lets start - * the decompression. - */ - ret = btrfs_decompress_bio(cb); + if (status) + cb->status = status; -csum_failed: - if (ret) - cb->status = errno_to_blk_status(ret); - finish_compressed_bio_read(cb); -out: + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -403,6 +286,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) kfree(cb); } +static void btrfs_finish_compressed_write_work(struct work_struct *work) +{ + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); + + finish_compressed_bio_write(cb); +} + /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -414,29 +305,18 @@ static void end_compressed_bio_write(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; + if (bio->bi_status) + cb->status = bio->bi_status; - btrfs_record_physical_zoned(cb->inode, cb->start, bio); + if (refcount_dec_and_test(&cb->pending_ios)) { + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - finish_compressed_bio_write(cb); -out: + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + } bio_put(bio); } -static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct bio *bio, int mirror_num) -{ - blk_status_t ret; - - ASSERT(bio->bi_iter.bi_size); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - return ret; -} - /* * Allocate a compressed_bio, which will be used to read/write on-disk * (aka, compressed) * data. 
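The compression.c hunks above replace the old per-sector accounting (pending_sectors, with compressed_bio_size() reserving room for inline checksums) with a plain reference count: pending_ios starts at 1 for the submitter, each bio handed out by alloc_compressed_bio() takes a reference, and whichever side drops the count to zero runs finish_compressed_bio_read()/finish_compressed_bio_write(). A minimal userspace sketch of that submit/complete pattern follows; cb_like, cb_get and cb_put are hypothetical names for illustration, not kernel API.

#include <stdatomic.h>
#include <stdio.h>

struct cb_like {
	atomic_int pending_ios;	/* one reference is held by the submitter */
};

/* Take one reference per in-flight bio (alloc_compressed_bio() analogue). */
static void cb_get(struct cb_like *cb)
{
	atomic_fetch_add(&cb->pending_ios, 1);
}

/* Returns 1 when the caller dropped the last reference and therefore owns
 * the final completion (the finish_compressed_bio_*() equivalent). */
static int cb_put(struct cb_like *cb)
{
	return atomic_fetch_sub(&cb->pending_ios, 1) == 1;
}

int main(void)
{
	struct cb_like cb;

	atomic_init(&cb.pending_ios, 1);	/* submitter's own reference */
	cb_get(&cb);				/* bio 1 issued */
	cb_get(&cb);				/* bio 2 issued */
	cb_put(&cb);				/* bio 1 endio: refs remain */
	cb_put(&cb);				/* bio 2 endio: submitter still holds one */
	if (cb_put(&cb))			/* submitter drops its reference */
		puts("last ref dropped: run the finish_compressed_bio_*() step");
	return 0;
}

The submitter-owned initial reference is what lets the error paths above simply end the failed bio and fall through to a final refcount_dec_and_test(&cb->pending_ios), instead of the removed wait_var_event()/pending_sectors dance in the old finish_cb labels.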
@@ -455,7 +335,7 @@ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - unsigned int opf, bio_end_io_t endio_func, + blk_opf_t opf, bio_end_io_t endio_func, u64 *next_stripe_start) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); @@ -487,7 +367,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte return ERR_PTR(ret); } *next_stripe_start = disk_bytenr + geom.len; - + refcount_inc(&cb->pending_ios); return bio; } @@ -505,7 +385,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, + blk_opf_t write_flags, struct cgroup_subsys_state *blkcg_css, bool writeback) { @@ -514,26 +394,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; u64 next_stripe_start; - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); - const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; + const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; - cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; cb->writeback = writeback; - cb->orig_bio = NULL; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; if (blkcg_css) @@ -554,8 +433,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, &next_stripe_start); if (IS_ERR(bio)) { ret = errno_to_blk_status(PTR_ERR(bio)); - bio = NULL; - goto finish_cb; + break; } if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; @@ -599,44 +477,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) - goto finish_cb; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + break; + } } - ret = submit_compressed_bio(fs_info, bio, 0); - if (ret) - goto finish_cb; + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, bio, 0); bio = NULL; } cond_resched(); } - if (blkcg_css) - kthread_associate_blkcg(NULL); - - return 0; -finish_cb: if (blkcg_css) kthread_associate_blkcg(NULL); - if (bio) { - bio->bi_status = ret; - bio_endio(bio); - } - /* Last byte of @cb is submitted, endio will free @cb */ - if (cur_disk_bytenr == disk_start + compressed_len) - return ret; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_start + compressed_len - cur_disk_bytenr) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. 
*/ - finish_compressed_bio_write(cb); + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_write(cb); return ret; } @@ -765,7 +624,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, int zeros; zeros = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, zeros); - flush_dcache_page(page); } } @@ -819,7 +677,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, blk_status_t ret; int ret2; int i; - u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -837,17 +694,15 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) { ret = BLK_STS_RESOURCE; goto out; } - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; - cb->mirror_num = mirror_num; - sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -893,9 +748,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, REQ_OP_READ, end_compressed_bio_read, &next_stripe_start); if (IS_ERR(comp_bio)) { - ret = errno_to_blk_status(PTR_ERR(comp_bio)); - comp_bio = NULL; - goto finish_cb; + cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); + break; } } /* @@ -931,22 +785,33 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, submit = true; if (submit) { - unsigned int nr_sectors; + /* Save the original iter for read repair */ + if (bio_op(comp_bio) == REQ_OP_READ) + btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - if (ret) - goto finish_cb; + /* + * Save the initial offset of this chunk, as there + * is no direct correlation between compressed pages and + * the original file offset. The field is only used for + * printing error messages. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; - nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); - sums += fs_info->csum_size * nr_sectors; + ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); + if (ret) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + break; + } - ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); - if (ret) - goto finish_cb; + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, comp_bio, mirror_num); comp_bio = NULL; } } + + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); return; fail: @@ -964,25 +829,6 @@ out: bio->bi_status = ret; bio_endio(bio); return; -finish_cb: - if (comp_bio) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - /* All bytes of @cb is submitted, endio will free @cb */ - if (cur_disk_byte == disk_bytenr + compressed_len) - return; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_bytenr + compressed_len - cur_disk_byte) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish @cb manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. 
*/ - finish_compressed_bio_read(cb); } /* @@ -1481,7 +1327,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, ASSERT(copy_start - decompressed < buf_len); memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + copy_start - decompressed, copy_len); - flush_dcache_page(bvec.bv_page); cur_offset += copy_len; bio_advance(orig_bio, copy_len); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2707404389a5..1aa02903de69 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ - refcount_t pending_sectors; + /* Number of outstanding bios */ + refcount_t pending_ios; /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -59,16 +59,12 @@ struct compressed_bio { /* IO errors */ blk_status_t status; - int mirror_num; - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u8 sums[]; + union { + /* For reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + struct work_struct write_end_work; + }; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -99,7 +95,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, + blk_opf_t write_flags, struct cgroup_subsys_state *blkcg_css, bool writeback); void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e49b1a0c071..4db85b9dc7ed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -107,14 +107,6 @@ struct btrfs_ioctl_encoded_io_args; #define BTRFS_STAT_CURR 0 #define BTRFS_STAT_PREV 1 -/* - * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size - */ -static inline u32 count_max_extents(u64 size) -{ - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -} - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -230,6 +222,13 @@ struct btrfs_root_backup { #define BTRFS_SUPER_INFO_SIZE 4096 /* + * The reserved space at the beginning of each device. + * It covers the primary super block and leaves space for potential use by other + * tools like bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -248,8 +247,12 @@ struct btrfs_super_block { __le64 chunk_root; __le64 log_root; - /* this will help find the new super based on the log root */ - __le64 log_root_transid; + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. + */ + __le64 __unused_log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -635,6 +638,9 @@ enum { /* Indicate we have half completed snapshot deletions pending. */ BTRFS_FS_UNFINISHED_DROPS, + /* Indicate we have to finish a zone to do next allocation. 
*/ + BTRFS_FS_NEED_ZONE_FINISH, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -656,6 +662,18 @@ enum btrfs_exclusive_operation { BTRFS_EXCLOP_SWAP_ACTIVATE, }; +/* Store data about transaction commits, exported via sysfs. */ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -675,9 +693,8 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - /* The xarray that holds all the FS roots */ - spinlock_t fs_roots_lock; - struct xarray fs_roots; + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; /* block group cache stuff */ rwlock_t block_group_cache_lock; @@ -851,11 +868,11 @@ struct btrfs_fs_info { struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; - struct btrfs_workqueue *endio_workers; - struct btrfs_workqueue *endio_meta_workers; - struct btrfs_workqueue *endio_raid56_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; - struct btrfs_workqueue *endio_meta_write_workers; + struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; @@ -995,10 +1012,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer xarray */ + /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct xarray extent_buffers; + struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; @@ -1033,6 +1050,12 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + /* Block groups and devices containing active swapfiles. 
*/ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; @@ -1048,6 +1071,8 @@ struct btrfs_fs_info { */ u64 zone_size; + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; @@ -1064,6 +1089,11 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ + wait_queue_head_t zone_finish_wait; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; @@ -1119,8 +1149,7 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - /* The root is tracked in fs_info::fs_roots */ - BTRFS_ROOT_REGISTERED, + BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1224,10 +1253,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by inode_lock + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock */ - struct xarray delayed_nodes; + struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1330,6 +1359,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* @@ -2475,8 +2506,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, - log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, @@ -2733,8 +2762,16 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); +static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, + u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; +} + /* - * Take the number of bytes to be checksummmed and figure out how many leaves + * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. 
*/ static inline u64 btrfs_csum_bytes_to_leaves( @@ -3251,11 +3288,18 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3305,9 +3349,9 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits); + u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, unsigned *bits); + struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, @@ -3353,6 +3397,12 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, @@ -4009,6 +4059,19 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zone_size > 0; } +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 36ab0859a263..1e8f17ff829e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -273,7 +273,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { - u64 nr_extents = count_max_extents(num_bytes); + u64 nr_extents = count_max_extents(fs_info, 
num_bytes); u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); @@ -350,7 +350,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * needs to free the reservation we just made. */ spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); + nr_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); @@ -413,7 +413,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) unsigned num_extents; spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); + num_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 66779ab3ed4a..e7f34871a132 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->p_list); } -static inline int btrfs_is_continuous_delayed_item( - struct btrfs_delayed_item *item1, - struct btrfs_delayed_item *item2) -{ - if (item1->key.type == BTRFS_DIR_INDEX_KEY && - item1->key.objectid == item2->key.objectid && - item1->key.type == item2->key.type && - item1->key.offset + 1 == item2->key.offset) - return 1; - return 0; -} - static struct btrfs_delayed_node *btrfs_get_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -78,7 +66,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = xa_load(&root->delayed_nodes, ino); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +78,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the xarray. In this case, the refcount + * this node from the radix tree. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the xarray at all; our release + * NULL like it was never in the radix at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +88,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the xarray, we want to bump the + * If this node is properly in the radix, we want to bump the * refcount twice, once for the inode and once for this get * operation. 
*/ @@ -128,30 +116,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; - do { - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; +again: + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed */ - refcount_set(&node->refs, 2); + /* cached in the btrfs inode and can be accessed */ + refcount_set(&node->refs, 2); - spin_lock(&root->inode_lock); - ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); - if (ret) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - if (ret != -EBUSY) - return ERR_PTR(ret); - } - } while (ret); + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); + radix_tree_preload_end(); return node; } @@ -270,7 +264,8 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - xa_erase(&root->delayed_nodes, delayed_node->inode_id); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -391,8 +386,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, - struct btrfs_delayed_item *ins, - int action) + struct btrfs_delayed_item *ins) { struct rb_node **p, *node; struct rb_node *parent_node = NULL; @@ -401,9 +395,9 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, int cmp; bool leftmost = true; - if (action == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (action == BTRFS_DELAYED_DELETION_ITEM) + else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) root = &delayed_node->del_root; else BUG(); @@ -429,32 +423,19 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; - ins->ins_or_del = action; - if (ins->key.type == BTRFS_DIR_INDEX_KEY && - action == BTRFS_DELAYED_INSERTION_ITEM && + /* Delayed items are always for dir index items. 
*/ + ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); + + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM && ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + delayed_node->index_cnt = ins->key.offset + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); return 0; } -static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_INSERTION_ITEM); -} - -static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_DELETION_ITEM); -} - static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); @@ -566,7 +547,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, num_bytes, 1); - item->bytes_reserved = num_bytes; + /* + * For insertions we track reserved metadata space by accounting + * for the number of leaves that will be used, based on the delayed + * node's index_items_size field. + */ + if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) + item->bytes_reserved = num_bytes; } return ret; @@ -592,6 +579,21 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } +static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, + unsigned int num_leaves) +{ + struct btrfs_fs_info *fs_info = node->root->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); + + /* There are no space reservations during log replay, bail out. */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, + bytes, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); +} + static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -665,22 +667,53 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * Insert a single delayed item or a batch of delayed items that have consecutive - * keys if they exist. + * Insert a single delayed item or a batch of delayed items, as many as possible + * that fit in a leaf. The delayed items (dir index keys) are sorted by their key + * in the rbtree, and if there's a gap between two consecutive dir index items, + * then it means at some point we had delayed dir indexes to add but they got + * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them + * into the subvolume tree. Dir index keys also have their offsets coming from a + * monotonically increasing counter, so we can't get new keys with an offset that + * fits within a gap between delayed dir index items. 
*/ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_node *node = first_item->delayed_node; LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; - const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; int total_size; char *ins_data = NULL; int ret; + bool continuous_keys_only = false; + + lockdep_assert_held(&node->mutex); + + /* + * During normal operation the delayed index offset is continuously + * increasing, so we can batch insert all items as there will not be any + * overlapping keys in the tree. + * + * The exception to this is log replay, where we may have interleaved + * offsets in the tree, so our batch needs to be continuous keys only in + * order to ensure we do not end up with out of order items in our leaf. + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + continuous_keys_only = true; + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata(). + */ + ASSERT(first_item->bytes_reserved == 0); list_add_tail(&first_item->tree_list, &item_list); batch.total_data_size = first_item->data_len; @@ -692,9 +725,19 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, int next_size; next = __btrfs_next_delayed_item(curr); - if (!next || !btrfs_is_continuous_delayed_item(curr, next)) + if (!next) + break; + + /* + * We cannot allow gaps in the key space if we're doing log + * replay. + */ + if (continuous_keys_only && + (next->key.offset != curr->key.offset + 1)) break; + ASSERT(next->bytes_reserved == 0); + next_size = next->data_len + sizeof(struct btrfs_item); if (total_size + next_size > max_size) break; @@ -751,9 +794,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); + ASSERT(node->index_item_leaves > 0); + + /* + * For normal operations we will batch an entire leaf's worth of delayed + * items, so if there are more items to process we can decrement + * index_item_leaves by 1 as we inserted 1 leaf's worth of items. + * + * However for log replay we may not have inserted an entire leaf's + * worth of items, we may have not had continuous items, so decrementing + * here would mess up the index_item_leaves accounting. For this case + * only clean up the accounting when there are no items left. + */ + if (next && !continuous_keys_only) { + /* + * We inserted one batch of items into a leaf and there are more + * items to flush in a future batch, now release one unit of + * metadata space from the delayed block reserve, corresponding + * to the leaf we just flushed to. + */ + btrfs_delayed_item_release_leaves(node, 1); + node->index_item_leaves--; + } else if (!next) { + /* + * There are no more items to insert. We can have a number of + * reserved leaves > 1 here - this happens when many dir index + * items are added and then removed before they are flushed (file + * names with a very short life, never span a transaction). So + * release all remaining leaves. 
+ */ + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); - btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } out: @@ -789,62 +864,75 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; - struct extent_buffer *leaf; - struct btrfs_key key; - struct list_head head; - int nitems, i, last_item; - int ret = 0; + struct extent_buffer *leaf = path->nodes[0]; + LIST_HEAD(batch_list); + int nitems, slot, last_slot; + int ret; + u64 total_reserved_size = item->bytes_reserved; - BUG_ON(!path->nodes[0]); + ASSERT(leaf != NULL); - leaf = path->nodes[0]; + slot = path->slots[0]; + last_slot = btrfs_header_nritems(leaf) - 1; + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(slot <= last_slot); + if (WARN_ON(slot > last_slot)) + return -ENOENT; - i = path->slots[0]; - last_item = btrfs_header_nritems(leaf) - 1; - if (i > last_item) - return -ENOENT; /* FIXME: Is errno suitable? */ + nitems = 1; + curr = item; + list_add_tail(&curr->tree_list, &batch_list); - next = item; - INIT_LIST_HEAD(&head); - btrfs_item_key_to_cpu(leaf, &key, i); - nitems = 0; /* - * count the number of the dir index items that we can delete in batch + * Keep checking if the next delayed item matches the next item in the + * leaf - if so, we can add it to the batch of items to delete from the + * leaf. */ - while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { - list_add_tail(&next->tree_list, &head); - nitems++; + while (slot < last_slot) { + struct btrfs_key key; - curr = next; next = __btrfs_next_delayed_item(curr); if (!next) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) - break; - - i++; - if (i > last_item) + slot++; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_comp_cpu_keys(&next->key, &key) != 0) break; - btrfs_item_key_to_cpu(leaf, &key, i); + nitems++; + curr = next; + list_add_tail(&curr->tree_list, &batch_list); + total_reserved_size += curr->bytes_reserved; } - if (!nitems) - return 0; - ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) - goto out; + return ret; + + /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ + if (total_reserved_size > 0) { + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we + * don't need to release/reserve qgroup space. 
+ */ + trace_btrfs_space_reservation(fs_info, "delayed_item", + item->key.objectid, total_reserved_size, + 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, + total_reserved_size, NULL); + } - list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - return ret; + return 0; } static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, @@ -852,43 +940,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_deletion_item(node); - if (!curr) - goto delete_fail; + while (ret == 0) { + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_deletion_item(node); + if (!item) { + mutex_unlock(&node->mutex); + break; + } + + ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1); + if (ret > 0) { + /* + * There's no matching item in the leaf. This means we + * have already deleted this item in a past run of the + * delayed items. We ignore errors when running delayed + * items from an async context, through a work queue job + * running btrfs_async_run_delayed_root(), and don't + * release delayed items that failed to complete. This + * is because we will retry later, and at transaction + * commit time we always run delayed items and will + * then deal with errors if they fail to run again. + * + * So just release delayed items for which we can't find + * an item in the tree, and move to the next item. + */ + btrfs_release_path(path); + btrfs_release_delayed_item(item); + ret = 0; + } else if (ret == 0) { + ret = btrfs_batch_delete_items(trans, root, path, item); + btrfs_release_path(path); + } - ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - if (ret < 0) - goto delete_fail; - else if (ret > 0) { /* - * can't find the item which the node points to, so this node - * is invalid, just drop it. + * We unlock and relock on each iteration, this is to prevent + * blocking other tasks for too long while we are being run from + * the async context (work queue job). Those tasks are typically + * running system calls like creat/mkdir/rename/unlink/etc which + * need to add delayed items to this delayed node. 
*/ - prev = curr; - curr = __btrfs_next_delayed_item(prev); - btrfs_release_delayed_item(prev); - ret = 0; - btrfs_release_path(path); - if (curr) { - mutex_unlock(&node->mutex); - goto do_again; - } else - goto delete_fail; + mutex_unlock(&node->mutex); } - btrfs_batch_delete_items(trans, root, path, curr); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -delete_fail: - btrfs_release_path(path); - mutex_unlock(&node->mutex); return ret; } @@ -1347,9 +1444,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_disk_key *disk_key, u8 type, u64 index) { + struct btrfs_fs_info *fs_info = trans->fs_info; + const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; + bool reserve_leaf_space; + u32 data_len; int ret; delayed_node = btrfs_get_or_create_delayed_node(dir); @@ -1365,6 +1466,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, delayed_item->key.objectid = btrfs_ino(dir); delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; + delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; @@ -1374,15 +1476,52 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + data_len = delayed_item->data_len + sizeof(struct btrfs_item); mutex_lock(&delayed_node->mutex); - ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + + if (delayed_node->index_item_leaves == 0 || + delayed_node->curr_index_batch_size + data_len > leaf_data_size) { + delayed_node->curr_index_batch_size = data_len; + reserve_leaf_space = true; + } else { + delayed_node->curr_index_batch_size += data_len; + reserve_leaf_space = false; + } + + if (reserve_leaf_space) { + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, + delayed_item); + /* + * Space was reserved for a dir index item insertion when we + * started the transaction, so getting a failure here should be + * impossible. + */ + if (WARN_ON(ret)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_item(delayed_item); + goto release_node; + } + + delayed_node->index_item_leaves++; + } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). 
+ */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; + } + + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1410,8 +1549,37 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(node->root, item); + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(item->bytes_reserved == 0); + ASSERT(node->index_item_leaves > 0); + + /* + * If there's only one leaf reserved, we can decrement this item from the + * current batch, otherwise we can not because we don't know which leaf + * it belongs to. With the current limit on delayed items, we rarely + * accumulate enough dir index items to fill more than one leaf (even + * when using a leaf size of 4K). + */ + if (node->index_item_leaves == 1) { + const u32 data_len = item->data_len + sizeof(struct btrfs_item); + + ASSERT(node->curr_index_batch_size >= data_len); + node->curr_index_batch_size -= data_len; + } + btrfs_release_delayed_item(item); + + /* If we now have no more dir index items, we can release all leaves. */ + if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + mutex_unlock(&node->mutex); return 0; } @@ -1444,6 +1612,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } item->key = item_key; + item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM; ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* @@ -1458,7 +1627,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_lock(&node->mutex); - ret = __btrfs_add_delayed_deletion_item(node, item); + ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1826,12 +1995,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); } + if (delayed_node->index_item_leaves > 0) { + btrfs_delayed_item_release_leaves(delayed_node, + delayed_node->index_item_leaves); + delayed_node->index_item_leaves = 0; + } + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { btrfs_delayed_item_release_metadata(root, curr_item); @@ -1863,35 +2037,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - unsigned long index = 0; - struct btrfs_delayed_node *delayed_node; + u64 inode_id = 0; struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; while (1) { - int n = 0; - spin_lock(&root->inode_lock); - if (xa_empty(&root->delayed_nodes)) { + n = 
radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { spin_unlock(&root->inode_lock); - return; + break; } - xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { + inode_id = delayed_nodes[n - 1]->inode_id + 1; + for (i = 0; i < n; i++) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (refcount_inc_not_zero(&delayed_node->refs)) { - delayed_nodes[n] = delayed_node; - n++; - } - if (n >= ARRAY_SIZE(delayed_nodes)) - break; + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; } - index++; spin_unlock(&root->inode_lock); - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b2412160c5bc..9795dc295a18 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -58,6 +58,17 @@ struct btrfs_delayed_node { u64 index_cnt; unsigned long flags; int count; + /* + * The size of the next batch of dir index items to insert (if this + * node is from a directory inode). Protected by @mutex. + */ + u32 curr_index_batch_size; + /* + * Number of leaves reserved for inserting dir index items (if this + * node belongs to a directory inode). This may be larger than the + * actual number of leaves we end up using. Protected by @mutex. + */ + u32 index_item_leaves; }; struct btrfs_delayed_item { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 99f37fca2e96..36a3debe9493 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; + delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; } @@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, if (num_bytes) delayed_refs_rsv->reserved += num_bytes; if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; + delayed_refs_rsv->full = true; spin_unlock(&delayed_refs_rsv->lock); if (num_bytes) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a7dd6ba25e99..f43196a893ca 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -587,7 +587,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, ASSERT(!IS_ERR(em)); map = em->map_lookup; - num_extents = cur_extent = 0; + num_extents = 0; + cur_extent = 0; for (i = 0; i < map->num_stripes; i++) { /* We have more device extent to copy */ if (srcdev != map->stripes[i].dev) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89e94ea2fef5..4c3166f3c725 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,6 +5,7 @@ #include <linux/fs.h> #include <linux/blkdev.h> +#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -50,7 +51,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -63,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, static int 
btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -/* - * btrfs_end_io_wq structs are used to do processing in task context when an IO - * is complete. This is used during reads to verify checksums, and it is used - * by writes to insert metadata for new file extents after IO is complete. - */ -struct btrfs_end_io_wq { - struct bio *bio; - bio_end_io_t *end_io; - void *private; - struct btrfs_fs_info *info; - blk_status_t status; - enum btrfs_wq_endio_type metadata; - struct btrfs_work work; -}; - -static struct kmem_cache *btrfs_end_io_wq_cache; - -int __init btrfs_end_io_wq_init(void) -{ - btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", - sizeof(struct btrfs_end_io_wq), - 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_end_io_wq_cache) - return -ENOMEM; - return 0; -} - -void __cold btrfs_end_io_wq_exit(void) -{ - kmem_cache_destroy(btrfs_end_io_wq_cache); -} - static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) { if (fs_info->csum_shash) @@ -255,8 +221,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, goto out; } btrfs_err_rl(eb->fs_info, - "parent transid verify failed on %llu wanted %llu found %llu", - eb->start, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); @@ -485,7 +451,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from extent_buffers */ + /* A dirty eb shouldn't disappear from buffer_radix */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -586,21 +552,23 @@ static int validate_extent_buffer(struct extent_buffer *eb) found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", - eb->start, found_start); + btrfs_err_rl(fs_info, + "bad tree block start, mirror %u want %llu have %llu", + eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } if (check_tree_block_fsid(eb)) { - btrfs_err_rl(fs_info, "bad fsid on block %llu", - eb->start); + btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", + eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "bad tree block level %d on %llu", - (int)btrfs_header_level(eb), eb->start); + btrfs_err(fs_info, + "bad tree block level, mirror %u level %d on logical %llu", + eb->read_mirror, btrfs_header_level(eb), eb->start); ret = -EIO; goto out; } @@ -611,8 +579,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - eb->start, +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); @@ -637,8 +605,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) set_extent_buffer_uptodate(eb); else btrfs_err(fs_info, - "block=%llu read time tree block corruption detected", - eb->start); + "read time tree block corruption detected on logical %llu mirror %u", + eb->start, eb->read_mirror); out: return ret; } @@ -739,58 +707,6 @@ 
err: return ret; } -static void end_workqueue_bio(struct bio *bio) -{ - struct btrfs_end_io_wq *end_io_wq = bio->bi_private; - struct btrfs_fs_info *fs_info; - struct btrfs_workqueue *wq; - - fs_info = end_io_wq->info; - end_io_wq->status = bio->bi_status; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - wq = fs_info->endio_meta_write_workers; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - wq = fs_info->endio_freespace_worker; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else - wq = fs_info->endio_write_workers; - } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else if (end_io_wq->metadata) - wq = fs_info->endio_meta_workers; - else - wq = fs_info->endio_workers; - } - - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); - btrfs_queue_work(wq, &end_io_wq->work); -} - -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata) -{ - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); - if (!end_io_wq) - return BLK_STS_RESOURCE; - - end_io_wq->private = bio->bi_private; - end_io_wq->end_io = bio->bi_end_io; - end_io_wq->info = info; - end_io_wq->status = 0; - end_io_wq->bio = bio; - end_io_wq->metadata = metadata; - - bio->bi_private = end_io_wq; - bio->bi_end_io = end_workqueue_bio; - return 0; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -815,7 +731,6 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; struct inode *inode; - blk_status_t ret; async = container_of(work, struct async_submit_bio, work); inode = async->inode; @@ -833,11 +748,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); - if (ret) { - async->bio->bi_status = ret; - bio_endio(async->bio); - } + btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -848,16 +759,23 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +/* + * Submit bio to an async queue. + * + * Return: + * - true if the work has been successfully submitted + * - false in case of error + */ +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return BLK_STS_RESOURCE; + return false; async->inode = inode; async->bio = bio; @@ -875,7 +793,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, btrfs_queue_work(fs_info->hipri_workers, &async->work); else btrfs_queue_work(fs_info->workers, &async->work); - return 0; + return true; } static blk_status_t btree_csum_one_bio(struct bio *bio) @@ -901,7 +819,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, { /* * when we're called for a write, we're already in the async * submission context. 
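
The hunk above also changes the async hand-off contract: btrfs_wq_submit_bio() now returns a bool instead of a blk_status_t, and the caller is expected to fall back to the synchronous path when the hand-off fails. A minimal userspace sketch of that contract; try_async_submit() and submit_sync() are invented stand-ins, not the kernel helpers:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct bio_stub { int id; };

/* Models btrfs_wq_submit_bio(): false means "could not even queue it". */
static bool try_async_submit(struct bio_stub *bio)
{
	void *async = malloc(64);	/* stands in for kmalloc(GFP_NOFS) */

	if (!async)
		return false;
	printf("bio %d queued for async checksumming\n", bio->id);
	free(async);			/* a real queue frees it in the worker */
	return true;
}

static void submit_sync(struct bio_stub *bio)
{
	printf("bio %d checksummed and submitted inline\n", bio->id);
}

int main(void)
{
	struct bio_stub b = { .id = 1 };

	if (!try_async_submit(&b))	/* boolean hand-off, no status plumbing */
		submit_sync(&b);
	return 0;
}
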
Just jump into btrfs_map_bio + * submission context. Just jump into btrfs_submit_bio. */ return btree_csum_one_bio(bio); } @@ -923,57 +841,54 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; + bio->bi_opf |= REQ_META; + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * called for a read, do the setup so that checksum validation - * can happen in the async kernel threads - */ - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_METADATA); - if (!ret) - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!should_async_write(fs_info, BTRFS_I(inode))) { - ret = btree_csum_one_bio(bio); - if (!ret) - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else { - /* - * kthread helpers are used to submit writes so that - * checksumming can happen in parallel across all CPUs - */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btree_submit_bio_start); + btrfs_submit_bio(fs_info, bio, mirror_num); + return; } + /* + * Kthread helpers are used to submit writes so that checksumming can + * happen in parallel across all CPUs. + */ + if (should_async_write(fs_info, BTRFS_I(inode)) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + return; + + ret = btree_csum_one_bio(bio); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } #ifdef CONFIG_MIGRATION -static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ - if (PageDirty(page)) + if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. 
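
With this change btrfs_submit_metadata_bio() sends reads straight down, tries the async checksum workers for writes, and only checksums inline when the async hand-off is declined or fails. A stubbed model of that ordering; every helper below is an invented stand-in, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct bio_stub { bool is_write; };

static bool should_async(void) { return true; }	/* e.g. sync-write check */
static bool wq_submit(struct bio_stub *b) { (void)b; return true; }
static int csum_one(struct bio_stub *b) { (void)b; return 0; }
static void submit(struct bio_stub *b) { (void)b; puts("submitted"); }

static void submit_metadata(struct bio_stub *bio)
{
	if (!bio->is_write) {		/* reads need no checksumming here */
		submit(bio);
		return;
	}
	if (should_async() && wq_submit(bio))
		return;			/* workers will checksum and submit */

	if (csum_one(bio)) {		/* inline fallback */
		puts("csum failed, bio ended with error");
		return;
	}
	submit(bio);
}

int main(void)
{
	struct bio_stub w = { .is_write = true };

	submit_metadata(&w);
	return 0;
}
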
*/ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_get_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } +#else +#define btree_migrate_folio NULL #endif - static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1073,10 +988,8 @@ static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .release_folio = btree_release_folio, .invalidate_folio = btree_invalidate_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btree_migratepage, -#endif - .dirty_folio = btree_dirty_folio, + .migrate_folio = btree_migrate_folio, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1158,7 +1071,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1210,9 +1123,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); #endif } @@ -1659,11 +1572,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_lock); - root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1705,14 +1619,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - spin_lock(&fs_info->fs_roots_lock); - ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, - root, GFP_NOFS); + ret = radix_tree_preload(GFP_NOFS); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_REGISTERED, &root->state); + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); return ret; } @@ -1864,7 +1784,7 @@ again: fail: /* * If our caller provided us an anonymous device, then it's his - * responsability to free it in case we fail. So we have to set our + * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ @@ -1947,25 +1867,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, return root; } -/* - * called by the kthread helper functions to finally call the bio end_io - * functions. 
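
btrfs_insert_fs_root() above reinstates the radix_tree_preload()/radix_tree_preload_end() bracket so node memory is prepared before fs_roots_radix_lock is taken and nothing under the spinlock needs a sleeping allocation. A userspace model of the same lock discipline; the list container and names are stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct root_stub { long objectid; struct root_stub *next; };

static pthread_mutex_t roots_lock = PTHREAD_MUTEX_INITIALIZER;
static struct root_stub *roots_head;

/* Allocate outside the lock (may fail or block), link inside it. */
static int insert_root(long objectid)
{
	struct root_stub *n = malloc(sizeof(*n));	/* the "preload" step */

	if (!n)
		return -1;
	n->objectid = objectid;

	pthread_mutex_lock(&roots_lock);	/* no allocation in here */
	n->next = roots_head;
	roots_head = n;
	pthread_mutex_unlock(&roots_lock);
	return 0;
}

int main(void)
{
	if (insert_root(5) == 0)
		printf("root 5 registered\n");
	return 0;
}
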
This is where read checksum verification actually happens - */ -static void end_workqueue_fn(struct btrfs_work *work) -{ - struct bio *bio; - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = container_of(work, struct btrfs_end_io_wq, work); - bio = end_io_wq->bio; - - bio->bi_status = end_io_wq->status; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - bio_endio(bio); - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); -} - static int cleaner_kthread(void *arg) { struct btrfs_fs_info *fs_info = arg; @@ -2272,10 +2173,14 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); - btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + if (fs_info->endio_workers) + destroy_workqueue(fs_info->endio_workers); + if (fs_info->endio_raid56_workers) + destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); + if (fs_info->compressed_write_workers) + destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2289,8 +2194,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. */ - btrfs_destroy_workqueue(fs_info->endio_meta_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + if (fs_info->endio_meta_workers) + destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2342,9 +2247,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_lock); + spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_lock); + spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } @@ -2352,21 +2257,28 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root; - unsigned long index = 0; + int ret; + struct btrfs_root *gang[8]; + int i; while (!list_empty(&fs_info->dead_roots)) { - root = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&root->root_list); + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) - btrfs_drop_and_free_fs_root(fs_info, root); - btrfs_put_root(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + btrfs_put_root(gang[0]); } - xa_for_each(&fs_info->fs_roots, index, root) { - btrfs_drop_and_free_fs_root(fs_info, root); + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -2413,7 +2325,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - 
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); + BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; + BTRFS_I(inode)->location.type = 0; + BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); } @@ -2462,25 +2376,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); - /* - * endios are largely parallel and should have a very - * low idle thresh - */ fs_info->endio_workers = - btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); + alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta", flags, - max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, - max_active, 2); + alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, - max_active, 4); + alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); + fs_info->compressed_write_workers = + alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2495,7 +2402,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) if (!(fs_info->workers && fs_info->hipri_workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && + fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -2522,6 +2429,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); return 0; } @@ -3134,8 +3044,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); - xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3143,7 +3053,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3240,6 +3150,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); + init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the 
superblock */ fs_info->nodesize = 4096; @@ -3247,6 +3158,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; + spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; @@ -3374,7 +3287,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots. This must be done before + * them into the fs_info->fs_roots_radix tree. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -3578,16 +3491,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); @@ -3625,8 +3528,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - btrfs_info(fs_info, "has skinny extents"); + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; /* * mixed block groups end up with duplicate but slightly offset @@ -3655,6 +3562,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device err = -EINVAL; goto fail_alloc; } + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata write, including log replay. + * Or we could screw up whatever the new feature requires. 
+ */ + if (unlikely(features && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY))) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + features); + err = -EINVAL; + goto fail_alloc; + } + if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -4499,11 +4420,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_lock); - xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4519,48 +4441,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *roots[8]; - unsigned long index = 0; - int i; + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; int err = 0; - int grabbed; + unsigned int ret = 0; while (1) { - struct btrfs_root *root; - - spin_lock(&fs_info->fs_roots_lock); - if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { - spin_unlock(&fs_info->fs_roots_lock); - return err; + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; } + root_objectid = gang[ret - 1]->root_key.objectid + 1; - grabbed = 0; - xa_for_each_start(&fs_info->fs_roots, index, root, index) { - /* Avoid grabbing roots in dead_roots */ - if (btrfs_root_refs(&root->root_item) > 0) - roots[grabbed++] = btrfs_grab_root(root); - if (grabbed >= ARRAY_SIZE(roots)) - break; + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_root(gang[i]); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - err = btrfs_orphan_cleanup(roots[i]); + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); if (err) - goto out; - btrfs_put_root(roots[i]); + break; + btrfs_put_root(gang[i]); } - index++; + root_objectid++; } -out: - /* Release the roots that remain uncleaned due to error */ - for (; i < grabbed; i++) { - if (roots[i]) - btrfs_put_root(roots[i]); + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); } return err; } @@ -4632,6 +4556,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. 
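
btrfs_cleanup_fs_roots(), like the other loops reverted in this commit, pages through the radix tree in gangs of 8: look up a batch starting at root_objectid, grab references, drop the lock to do the work, then resume the lookup just past the last objectid seen. A runnable model of that pagination over a sorted array; the key set and batch size are arbitrary:

#include <stdio.h>

#define BATCH 8

static const long keys[] = { 1, 3, 7, 9, 12, 20, 21, 22, 30, 41, 52 };
#define NKEYS (sizeof(keys) / sizeof(keys[0]))

/* Like radix_tree_gang_lookup(): up to max keys that are >= start. */
static int gang_lookup(long start, long *out, int max)
{
	int n = 0;

	for (unsigned int i = 0; i < NKEYS && n < max; i++)
		if (keys[i] >= start)
			out[n++] = keys[i];
	return n;
}

int main(void)
{
	long gang[BATCH];
	long next = 0;
	int n;

	while ((n = gang_lookup(next, gang, BATCH)) != 0) {
		for (int i = 0; i < n; i++)
			printf("processing root %ld\n", gang[i]);
		next = gang[n - 1] + 1;		/* resume past the last hit */
	}
	return 0;
}
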
So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4672,8 +4607,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4870,28 +4803,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - unsigned long index = 0; - int grabbed = 0; - struct btrfs_root *roots[8]; + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; - spin_lock(&fs_info->fs_roots_lock); - while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, - ULONG_MAX, 8, XA_PRESENT))) { - for (int i = 0; i < grabbed; i++) - roots[i] = btrfs_grab_root(roots[i]); - spin_unlock(&fs_info->fs_roots_lock); + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (int i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - btrfs_free_log(NULL, roots[i]); - btrfs_put_root(roots[i]); + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); } - index++; - spin_lock(&fs_info->fs_roots_lock); + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4ee8c42c9f78..8993b428e09c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -17,13 +17,6 @@ */ #define BTRFS_BDEV_BLOCKSIZE (4096) -enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA, - BTRFS_WQ_ENDIO_METADATA, - BTRFS_WQ_ENDIO_FREE_SPACE, - BTRFS_WQ_ENDIO_RAID56, -}; - static inline u64 btrfs_sb_offset(int mirror) { u64 start = SZ_16K; @@ -121,11 +114,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata); -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, @@ -145,8 +136,6 @@ int btree_lock_page_hook(struct page *page, void 
*data, int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -int __init btrfs_end_io_wq_init(void); -void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0867c5cd6e01..ea3ec1e761e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are discarding. + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_io_stripe *stripe; + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; int i; num_bytes = end - cur; - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bioc, 0); - /* - * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or - * -EOPNOTSUPP. For any such error, @num_bytes is not updated, - * thus we can't continue anyway. - */ - if (ret < 0) - goto out; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } - stripe = bioc->stripes; - for (i = 0; i < bioc->num_stripes; i++, stripe++) { + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; - struct btrfs_device *device = stripe->dev; - if (!device->bdev) { + if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); - if (!ret) { - discarded_bytes += bytes; - } else if (ret != -EOPNOTSUPP) { + if (ret) { /* - * Logic errors or -ENOMEM, or -EIO, but - * unlikely to happen. - * - * And since there are two loops, explicitly - * go to out to avoid confusion. + * Keep going if discard is not supported by the + * device. */ - btrfs_put_bioc(bioc); - goto out; + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; } - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. 
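
In the reworked btrfs_discard_extent() above, EOPNOTSUPP from one stripe's device just skips that device, any other error aborts the walk, and bytes are only accumulated on success. A compact model with a stubbed per-device table (the table itself is invented):

#include <errno.h>
#include <stdio.h>

struct stripe_stub { const char *dev; int err; long bytes; };

static const struct stripe_stub stripes[] = {
	{ "devA", 0, 4096 },
	{ "devB", -EOPNOTSUPP, 0 },	/* no discard support: skip */
	{ "devC", 0, 4096 },
};

int main(void)
{
	long discarded = 0;
	int ret = 0;

	for (unsigned int i = 0; i < sizeof(stripes) / sizeof(stripes[0]); i++) {
		int err = stripes[i].err;	/* models do_discard_extent() */

		if (err == -EOPNOTSUPP)
			continue;		/* tolerated per device */
		if (err) {
			ret = err;		/* hard error: stop everything */
			break;
		}
		discarded += stripes[i].bytes;
	}
	printf("discarded %ld bytes, ret %d\n", discarded, ret);
	return ret ? 1 : 0;
}
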
- */ - } - btrfs_put_bioc(bioc); + kfree(stripes); + if (ret) + break; cur += num_bytes; } -out: btrfs_bio_counter_dec(fs_info); - if (actual_bytes) *actual_bytes = discarded_bytes; - - - if (ret == -EOPNOTSUPP) - ret = 0; return ret; } @@ -3832,7 +3816,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3894,8 +3878,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable allocation here to avoid allocating a + * regular (non-relocation data) extent. With a mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily breaks the write pointer. + */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3965,23 +3965,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } -static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* If we can activate a new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell a + * caller to try again and hope it makes some progress by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + + /* + * We cannot activate a new block group and there is not enough space + * left in any block groups. So, allocating a new block group may not + * help. But there is nothing to do anyway, so let's go with it. 
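
can_allocate_chunk_zoned() above is a small decision ladder: activate a fresh zone if possible, otherwise try to finish one data block group, otherwise steer the caller toward a smaller allocation (-ENOSPC), a plain retry (-EAGAIN), or a best-effort chunk allocation. The same ladder with stubbed predicates; all helpers and values below are invented:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool can_activate_zone(void) { return false; }
static int finish_one_data_bg(void) { return 0; }	/* 1: one finished */

static int can_allocate_chunk_zoned(bool is_data, long max_extent_size,
				    long min_alloc_size)
{
	if (can_activate_zone())
		return 0;			/* room for a fresh zone */

	if (is_data) {
		int ret = finish_one_data_bg();

		if (ret == 1)
			return 0;		/* freed an active-zone slot */
		if (ret < 0)
			return ret;
	}

	if (max_extent_size >= min_alloc_size)
		return -ENOSPC;			/* retry with a smaller size */

	if (is_data)
		return -EAGAIN;			/* writing may free up space */

	return 0;				/* nothing better left to try */
}

int main(void)
{
	printf("decision: %d\n", can_allocate_chunk_zoned(true, 0, 4096));
	return 0;
}
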
+ */ + return 0; +} + +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - return true; + return 0; case BTRFS_EXTENT_ALLOC_ZONED: - /* - * If we have enough free space left in an already - * active block group and we can't activate any other - * zone now, do not allow allocating a new chunk and - * let find_free_extent() retry with a smaller size. - */ - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) - return false; - return true; + return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } @@ -4063,8 +4103,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, int exist = 0; /*Check if allocation policy allows to create a new chunk */ - if (!can_allocate_chunk(fs_info, ffe_ctl)) - return -ENOSPC; + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; trans = current->journal_info; if (trans) @@ -5813,7 +5854,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); @@ -5976,7 +6017,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = SZ_1M, len = 0, end = 0; + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; @@ -6020,8 +6061,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) break; } - /* Ensure we skip the reserved area in the first 1M */ - start = max_t(u64, start, SZ_1M); + /* Ensure we skip the reserved space on each device. */ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f6b544ae616..bfae67c593c5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -144,6 +144,7 @@ struct tree_entry { */ struct btrfs_bio_ctrl { struct bio *bio; + int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_stripe_boundary; u32 len_to_oe_boundary; @@ -178,61 +179,56 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -static void submit_one_bio(struct bio *bio, int mirror_num, - enum btrfs_compression_type compress_type) +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - struct extent_io_tree *tree = bio->bi_private; + struct bio *bio; + struct bio_vec *bv; + struct inode *inode; + int mirror_num; + + if (!bio_ctrl->bio) + return; - bio->bi_private = NULL; + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); + inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - if (is_data_inode(tree->private_data)) - btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - compress_type); - else - btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); - /* - * Above submission hooks will handle the error by ending the bio, - * which will do the cleanup properly. 
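
submit_one_bio() now pulls everything it needs (inode, mirror number, compression type) out of the btrfs_bio_ctrl and clears bio_ctrl->bio once the bio has been routed, because the endio handler owns it from that point on. A sketch of that single-owner dispatch; the types and print statements are stand-ins:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct bio_stub { bool is_data; bool is_write; };
struct bio_ctrl_stub { struct bio_stub *bio; int mirror_num; };

static void submit_one(struct bio_ctrl_stub *ctrl)
{
	struct bio_stub *bio = ctrl->bio;

	if (!bio)
		return;				/* nothing accumulated yet */

	if (!bio->is_data)
		printf("metadata path, mirror %d\n", ctrl->mirror_num);
	else if (bio->is_write)
		printf("data write path, mirror %d\n", ctrl->mirror_num);
	else
		printf("data read path, mirror %d\n", ctrl->mirror_num);

	ctrl->bio = NULL;			/* endio owns the bio now */
}

int main(void)
{
	struct bio_stub b = { .is_data = true, .is_write = false };
	struct bio_ctrl_stub ctrl = { .bio = &b, .mirror_num = 1 };

	submit_one(&ctrl);
	submit_one(&ctrl);			/* second call is a no-op */
	return 0;
}
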
So here we should not return - * any error, or the caller of submit_extent_page() will do cleanup - * again, causing problems. - */ -} + btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -/* Cleanup unsubmitted bios */ -static void end_write_bio(struct extent_page_data *epd, int ret) -{ - struct bio *bio = epd->bio_ctrl.bio; + if (!is_data_inode(inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); + else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_submit_data_write_bio(inode, bio, mirror_num); + else + btrfs_submit_data_read_bio(inode, bio, mirror_num, + bio_ctrl->compress_type); - if (bio) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - epd->bio_ctrl.bio = NULL; - } + /* The bio is owned by the bi_end_io handler now */ + bio_ctrl->bio = NULL; } /* - * Submit bio from extent page data via submit_one_bio - * - * Return 0 if everything is OK. - * Return <0 for error. + * Submit or fail the current bio in an extent_page_data structure. */ -static void flush_write_bio(struct extent_page_data *epd) +static void submit_write_bio(struct extent_page_data *epd, int ret) { struct bio *bio = epd->bio_ctrl.bio; - if (bio) { - submit_one_bio(bio, 0, 0); - /* - * Clean up of epd->bio is handled by its endio function. - * And endio is either triggered by successful bio execution - * or the error handler of submit bio hook. - * So at this point, no matter what happened, we don't need - * to clean up epd->bio. - */ + if (!bio) + return; + + if (ret) { + ASSERT(ret < 0); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + /* The bio is owned by the bi_end_io handler now */ epd->bio_ctrl.bio = NULL; + } else { + submit_one_bio(&epd->bio_ctrl); } } @@ -376,131 +372,121 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, - struct rb_node *search_start, - u64 offset, - struct rb_node *node, - struct rb_node ***p_in, - struct rb_node **parent_in) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - if (p_in && parent_in) { - p = *p_in; - parent = *parent_in; - goto do_insert; - } - - p = search_start ? &search_start : &root->rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - -do_insert: - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - /** * Search @tree for an entry that contains @offset. Such entry would have * entry->start <= offset && entry->end >= offset. * * @tree: the tree to search * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * @p_ret: pointer where new node should be anchored (used when inserting an + * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * This function returns a pointer to the entry that contains @offset byte - * address. If no such entry exists, then NULL is returned and the other - * pointer arguments to the function are filled, otherwise the found entry is - * returned and other pointers are left untouched. 
+ * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. */ -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **next_ret, - struct rb_node **prev_ret, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node **n = &root->rb_node; + struct rb_node **node = &root->rb_node; struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - while (*n) { - prev = *n; + while (*node) { + prev = *node; entry = rb_entry(prev, struct tree_entry, rb_node); - prev_entry = entry; if (offset < entry->start) - n = &(*n)->rb_left; + node = &(*node)->rb_left; else if (offset > entry->end) - n = &(*n)->rb_right; + node = &(*node)->rb_right; else - return *n; + return *node; } - if (p_ret) - *p_ret = n; + if (node_ret) + *node_ret = node; if (parent_ret) *parent_ret = prev; - if (next_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; + /* Search neighbors until we find the first one past the end */ + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); } - if (prev_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - } - return NULL; + return prev; } -static inline struct rb_node * -tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) { - struct rb_node *next= NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); - if (!ret) - return next; - return ret; + return tree_search_for_insert(tree, offset, NULL, NULL); } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +/** + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. 
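
Both search helpers above share one descent: walk down from the root and, on a miss, report the link where a new node would be spliced in plus its parent. A self-contained model using an unbalanced binary tree; the kernel version additionally recolors via rb_link_node()/rb_insert_color(), which this sketch skips:

#include <stdio.h>
#include <stdlib.h>

struct node { long start, end; struct node *left, *right; };

/* On a hit return the node; on a miss fill the insertion link/parent. */
static struct node *search_for_insert(struct node **rootp, long offset,
				      struct node ***link_ret,
				      struct node **parent_ret)
{
	struct node **link = rootp;
	struct node *parent = NULL;

	while (*link) {
		parent = *link;
		if (offset < parent->start)
			link = &parent->left;
		else if (offset > parent->end)
			link = &parent->right;
		else
			return parent;	/* offset inside this entry */
	}
	*link_ret = link;		/* untouched on the hit path above */
	*parent_ret = parent;
	return NULL;
}

int main(void)
{
	struct node **link, *parent, *root = NULL;

	if (!search_for_insert(&root, 5, &link, &parent)) {
		struct node *n = calloc(1, sizeof(*n));

		if (!n)
			return 1;
		n->start = 0;
		n->end = 9;
		*link = n;		/* splice in at the reported link */
	}
	printf("hit: %s\n",
	       search_for_insert(&root, 5, &link, &parent) ? "yes" : "no");
	free(root);
	return 0;
}
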
+ */ +static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) { - return tree_search_for_insert(tree, offset, NULL, NULL); + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct tree_entry, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return *node; + } + + orig_prev = prev; + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + prev = orig_prev; + + entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < entry->start) { + prev = rb_prev(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + + return NULL; } /* @@ -554,7 +540,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 *bits, + struct extent_state *state, u32 bits, struct extent_changeset *changeset); /* @@ -568,37 +554,56 @@ static void set_state_bits(struct extent_io_tree *tree, * probably isn't what you want to call (see set/clear_extent_bit). */ static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, - struct rb_node ***p, - struct rb_node **parent, - u32 *bits, struct extent_changeset *changeset) + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) { - struct rb_node *node; - - if (end < start) { - btrfs_err(tree->fs_info, - "insert state: end < start %llu %llu", end, start); - WARN_ON(1); - } - state->start = start; - state->end = end; + struct rb_node **node; + struct rb_node *parent; + const u64 end = state->end; set_state_bits(tree, state, bits, changeset); - node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - found->start, found->end, start, end); - return -EEXIST; + node = &tree->state.rb_node; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); return 0; } /* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 
'split' indicates an * offset inside 'orig' where it should be split. @@ -615,7 +620,8 @@ static int insert_state(struct extent_io_tree *tree, static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { - struct rb_node *node; + struct rb_node *parent = NULL; + struct rb_node **node; if (tree->private_data && is_data_inode(tree->private_data)) btrfs_split_delalloc_extent(tree->private_data, orig, split); @@ -625,12 +631,27 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, - &prealloc->rb_node, NULL, NULL); - if (node) { - free_extent_state(prealloc); - return -EEXIST; + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + return 0; } @@ -652,11 +673,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, int wake, + u32 bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -818,8 +839,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake, - changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); goto next; } goto search_again; @@ -840,13 +860,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -937,9 +957,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { - u32 bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -1033,11 +1053,9 @@ again: if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1060,7 +1078,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1116,7 +1134,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); 
cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1150,8 +1168,9 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, changeset); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1179,7 +1198,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, changeset); + set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1274,10 +1293,9 @@ again: err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1294,9 +1312,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1335,10 +1353,9 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, - NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1372,8 +1389,9 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, NULL); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1398,9 +1416,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, NULL); + set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1674,7 +1692,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { - node = __etree_search(tree, start, &next, &prev, NULL, NULL); + node = tree_search_prev_next(tree, start, &prev, &next); if (!node && !next && !prev) { /* * Tree is completely empty, send full range and let @@ -2007,10 +2025,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; bool found; @@ -2418,6 +2438,20 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) return ret; } +static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == failrec->num_copies) + return cur_mirror + 1 - failrec->num_copies; + return cur_mirror + 1; +} + +static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == 1) + return failrec->num_copies; + return cur_mirror - 1; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2430,7 +2464,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, u64 private; struct io_failure_record *failrec; struct extent_state *state; - int num_copies; + int mirror; int ret; private = 0; @@ -2454,20 +2488,19 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, EXTENT_LOCKED); spin_unlock(&io_tree->lock); - if (state && state->start <= failrec->start && - state->end >= failrec->start + failrec->len - 1) { - num_copies = btrfs_num_copies(fs_info, failrec->logical, - failrec->len); - if (num_copies > 1) { - repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, - failrec->failed_mirror); - } - } + if (!state || state->start > failrec->start || + state->end < failrec->start + failrec->len - 1) + goto out; + + mirror = failrec->this_mirror; + do { + mirror = prev_mirror(failrec, mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, mirror); + } while (mirror != failrec->failed_mirror); out: free_io_failure(failure_tree, io_tree, failrec); - return 0; } @@ -2506,17 +2539,16 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start) + struct btrfs_bio *bbio, + unsigned int bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_map *em; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { @@ -2528,7 +2560,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. 
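
next_mirror() and prev_mirror() above implement 1-based rotation over the available copies, so the retry loop can start anywhere and know it has tried everything once it is back at the failed mirror. A tiny runnable demonstration of the arithmetic:

#include <stdio.h>

/* Mirrors are numbered 1..num_copies; both directions wrap around. */
static int next_mirror(int cur, int num_copies)
{
	return (cur == num_copies) ? 1 : cur + 1;
}

static int prev_mirror(int cur, int num_copies)
{
	return (cur == 1) ? num_copies : cur - 1;
}

int main(void)
{
	const int num_copies = 3, failed = 2;

	/* Visit every other copy exactly once, as the retry loop does. */
	for (int m = next_mirror(failed, num_copies); m != failed;
	     m = next_mirror(m, num_copies))
		printf("try mirror %d\n", m);

	printf("prev of failed mirror %d is %d\n",
	       failed, prev_mirror(failed, num_copies));
	return 0;
}
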
*/ - + ASSERT(failrec->this_mirror == bbio->mirror_num); + ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2538,41 +2571,28 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; - failrec->this_mirror = 0; - failrec->compress_type = BTRFS_COMPRESS_NONE; + failrec->failed_mirror = bbio->mirror_num; + failrec->this_mirror = bbio->mirror_num; + failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return ERR_PTR(-EIO); - } + btrfs_debug(fs_info, + "new io failure record logical %llu start %llu", + failrec->logical, start); - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { + failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); + if (failrec->num_copies == 1) { + /* + * We only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. No + * matter what the error is, it is very likely to persist. + */ + btrfs_debug(fs_info, + "cannot repair logical %llu num_copies %d", + failrec->logical, failrec->num_copies); kfree(failrec); return ERR_PTR(-EIO); } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->compress_type = em->compress_type; - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - /* Set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); @@ -2589,65 +2609,16 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, - struct io_failure_record *failrec, - int failed_mirror) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - /* The failure record should only contain one sector */ - ASSERT(failrec->len == fs_info->sectorsize); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. 
- */ - ASSERT(failed_mirror); - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - return true; -} - -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook) { + u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); + struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; @@ -2657,12 +2628,24 @@ int btrfs_repair_one_sector(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start); + failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); - - if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { + /* + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); + if (failrec->this_mirror == failrec->failed_mirror) { + btrfs_debug(fs_info, + "failed to repair num_copies %d this_mirror %d failed_mirror %d", + failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2695,7 +2678,7 @@ int btrfs_repair_one_sector(struct inode *inode, * will be handled by the endio on the repair_bio, so we can't return an * error here. 
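
With btrfs_check_repairable() folded into btrfs_repair_one_sector(), the retry budget is implicit in the same ring: this_mirror starts at the mirror that failed, each repair attempt advances it with next_mirror(), and once it wraps back around to failed_mirror every copy has been tried and the sector is given up as -EIO. A minimal sketch of that progression (compiled together with the previous sketch, reusing its next_mirror() and record type; each loop iteration models one more failed resubmission):

  static void show_retry_order(const struct io_failure_record *rec,
                               int failed_mirror)
  {
          int this_mirror = failed_mirror;

          for (;;) {
                  this_mirror = next_mirror(rec, this_mirror);
                  if (this_mirror == failed_mirror) {
                          printf("all %d copies tried, -EIO\n",
                                 rec->num_copies);
                          return;
                  }
                  printf("resubmit read against mirror %d\n", this_mirror);
          }
  }

For num_copies = 3 and failed_mirror = 2 this tries mirrors 3 and 1 before giving up.
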
*/ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); + submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); return BLK_STS_OK; } @@ -2727,21 +2710,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_data_read_repair(struct inode *inode, - struct bio *failed_bio, - u32 bio_offset, struct page *page, - unsigned int pgoff, - u64 start, u64 end, - int failed_mirror, - unsigned int error_bitmap) +static void end_sector_io(struct page *page, u64 offset, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct extent_state *cached = NULL; + + end_page_read(page, uptodate, offset, sectorsize); + if (uptodate) + set_extent_uptodate(&inode->io_tree, offset, + offset + sectorsize - 1, &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&inode->io_tree, offset, + offset + sectorsize - 1, &cached); +} + +static void submit_data_read_repair(struct inode *inode, + struct btrfs_bio *failed_bbio, + u32 bio_offset, const struct bio_vec *bvec, + unsigned int error_bitmap) { + const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page *page = bvec->bv_page; + const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; + const u64 end = start + bvec->bv_len - 1; const u32 sectorsize = fs_info->sectorsize; const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int error = 0; int i; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); /* This repair is only for data */ ASSERT(is_data_inode(inode)); @@ -2753,12 +2750,11 @@ static blk_status_t submit_data_read_repair(struct inode *inode, * We only get called on buffered IO, thus page must be mapped and bio * must not be cloned. */ - ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { const unsigned int offset = i * sectorsize; - struct extent_state *cached = NULL; bool uptodate = false; int ret; @@ -2771,10 +2767,9 @@ static blk_status_t submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bio, - bio_offset + offset, - page, pgoff + offset, start + offset, - failed_mirror, btrfs_submit_data_bio); + ret = btrfs_repair_one_sector(inode, failed_bbio, + bio_offset + offset, page, pgoff + offset, + btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2785,24 +2780,12 @@ static blk_status_t submit_data_read_repair(struct inode *inode, continue; } /* - * Repair failed, just record the error but still continue. - * Or the remaining sectors will not be properly unlocked. + * Continue on failed repair, otherwise the remaining sectors + * will not be properly unlocked. 
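
submit_data_read_repair() now takes the whole bio_vec and fans it out into sectors, consulting one bit per sector: bit i of error_bitmap, as filled in by btrfs_verify_data_csum(), marks sector i of the range as bad. A simplified model of the walk (user space; the repair submission and the end_sector_io() bookkeeping are stubbed out with prints):

  #include <stdio.h>

  #define SECTORSIZE 4096ULL

  static void walk_sectors(unsigned long long start, int nr_bits,
                           unsigned int error_bitmap)
  {
          for (int i = 0; i < nr_bits; i++) {
                  unsigned long long off = start + i * SECTORSIZE;

                  if (!(error_bitmap & (1u << i))) {
                          /* kernel: end_sector_io(page, off, true) */
                          printf("sector %llu: good, finish uptodate\n", off);
                          continue;
                  }
                  /* kernel: btrfs_repair_one_sector(); if submission
                   * succeeds, the repair bio's endio finishes the sector
                   * later, otherwise end_sector_io(page, off, false). */
                  printf("sector %llu: bad, submit repair read\n", off);
          }
  }

  int main(void)
  {
          walk_sectors(0, 4, 0x5); /* bits 0 and 2 set: those sectors failed */
          return 0;
  }
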
*/ - if (!error) - error = ret; next: - end_page_read(page, uptodate, start + offset, sectorsize); - if (uptodate) - set_extent_uptodate(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached); + end_sector_io(page, start + offset, uptodate); } - return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2966,7 +2949,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* * Find extent buffer for a given bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2985,9 +2968,11 @@ static struct extent_buffer *find_extent_buffer_readpage( return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup extent buffer xarray */ - eb = xa_load(&fs_info->extent_buffers, - bytenr >> fs_info->sectorsize_bits); + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); ASSERT(eb); return eb; } @@ -3015,7 +3000,6 @@ static void end_bio_extent_readpage(struct bio *bio) */ u32 bio_offset = 0; int mirror; - int ret; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -3026,6 +3010,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; unsigned int error_bitmap = (unsigned int)-1; + bool repair = false; u64 start; u64 end; u32 len; @@ -3063,57 +3048,23 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); - ret = error_bitmap; + if (error_bitmap) + uptodate = false; } else { - ret = btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror); + if (btrfs_validate_metadata_buffer(bbio, + page, start, end, mirror)) + uptodate = false; } - if (ret) - uptodate = false; - else - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, - page, - btrfs_ino(BTRFS_I(inode)), 0); } - if (likely(uptodate)) - goto readpage_ok; - - if (is_data_inode(inode)) { - /* - * If we failed to submit the IO at all we'll have a - * mirror_num == 0, in which case we need to just mark - * the page with an error and unlock it and carry on. - */ - if (mirror == 0) - goto readpage_ok; - - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_data_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), - start, end, mirror, - error_bitmap); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } else { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); - } -readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, page, + btrfs_ino(BTRFS_I(inode)), 0); + /* * Zero out the remaining part if this range straddles * i_size.
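
find_extent_buffer_readpage() and find_extent_buffer_nolock() now key the radix tree by bytenr >> sectorsize_bits and only need rcu_read_lock() around the lookup, since insertions happen elsewhere under fs_info->buffer_lock. A condensed sketch of the refcounted variant of the pattern (kernel-style pseudocode using the field names from the hunks above, not a drop-in):

  static struct extent_buffer *lookup_eb(struct btrfs_fs_info *fs_info,
                                         u64 bytenr)
  {
          struct extent_buffer *eb;

          rcu_read_lock();
          eb = radix_tree_lookup(&fs_info->buffer_radix,
                                 bytenr >> fs_info->sectorsize_bits);
          /* Only hand out buffers that still have a live reference;
           * a buffer whose refs hit zero is on its way out of the tree. */
          if (eb && atomic_inc_not_zero(&eb->refs)) {
                  rcu_read_unlock();
                  return eb;
          }
          rcu_read_unlock();
          return NULL;
  }

The endio-context caller skips the refcount step entirely and just ASSERT()s the buffer exists, because the read it is completing already pins it.
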
@@ -3130,14 +3081,44 @@ readpage_ok: zero_user_segment(page, zero_start, offset_in_page(end) + 1); } + } else if (is_data_inode(inode)) { + /* + * Only try to repair bios that actually made it to a + * device. If the bio failed to be submitted mirror + * is 0 and we need to fail it without retrying. + * + * This also includes the high level bios for compressed + * extents - these never make it to a device and repair + * is already handled on the lower compressed bio. + */ + if (mirror > 0) + repair = true; + } else { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + } + + if (repair) { + /* + * submit_data_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. + */ + submit_data_read_repair(inode, bbio, bio_offset, bvec, + error_bitmap); + } else { + /* Update page status and unlock */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); } + ASSERT(bio_offset + len > bio_offset); bio_offset += len; - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3206,19 +3187,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) -{ - struct btrfs_bio *bbio; - struct bio *new; - - /* Bio allocation backed by a bioset does not fail */ - new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(new); - btrfs_bio_init(bbio); - bbio->iter = bio->bi_iter; - return new; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; @@ -3357,7 +3325,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, - unsigned int opf, + blk_opf_t opf, bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, enum btrfs_compression_type compress_type) @@ -3378,7 +3346,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; - bio->bi_private = &inode->io_tree; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) @@ -3437,13 +3404,12 @@ error: * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @compress_type: compress type for current bio */ -static int submit_extent_page(unsigned int opf, +static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, - int mirror_num, enum btrfs_compression_type compress_type, bool force_bio_submit) { @@ -3455,10 +3421,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (force_bio_submit && bio_ctrl->bio) { - submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); - bio_ctrl->bio = NULL; - } + if (force_bio_submit) + submit_one_bio(bio_ctrl); while (cur < pg_offset + size) { u32 offset = cur - pg_offset; @@ -3498,8 
+3462,7 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); - bio_ctrl->bio = NULL; + submit_one_bio(bio_ctrl); } cur += added; } @@ -3615,7 +3578,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, */ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start) + blk_opf_t read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3647,7 +3610,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (zero_offset) { iosize = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, iosize); - flush_dcache_page(page); } } begin_page_read(fs_info, page); @@ -3662,7 +3624,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3746,7 +3707,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct extent_state *cached = NULL; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3779,10 +3739,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, bio_ctrl, page, disk_bytenr, iosize, - pg_offset, - end_bio_extent_readpage, 0, - this_bio_flag, - force_bio_submit); + pg_offset, end_bio_extent_readpage, + this_bio_flag, force_bio_submit); if (ret) { /* * We have to unlock the remaining range, or the page @@ -3815,8 +3773,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. 
*/ - if (bio_ctrl.bio) - submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + submit_one_bio(&bio_ctrl); return ret; } @@ -3983,8 +3940,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int saved_ret = 0; int ret = 0; int nr = 0; - u32 opf = REQ_OP_WRITE; - const unsigned int write_flags = wbc_to_write_flags(wbc); + enum req_op op = REQ_OP_WRITE; + const blk_opf_t write_flags = wbc_to_write_flags(wbc); bool has_error = false; bool compressed; @@ -4058,7 +4015,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) - opf = REQ_OP_ZONE_APPEND; + op = REQ_OP_ZONE_APPEND; free_extent_map(em); em = NULL; @@ -4094,12 +4051,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(opf | write_flags, wbc, + ret = submit_extent_page(op | write_flags, wbc, &epd->bio_ctrl, page, disk_bytenr, iosize, cur - page_offset(page), end_bio_extent_writepage, - 0, 0, false); + 0, false); if (ret) { has_error = true; if (!saved_ret) @@ -4164,10 +4121,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, return 0; } - if (page->index == end_index) { + if (page->index == end_index) memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); - flush_dcache_page(page); - } ret = set_page_extent_mapped(page); if (ret < 0) { @@ -4276,7 +4231,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; btrfs_tree_lock(eb); } @@ -4286,7 +4241,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; } while (1) { @@ -4333,7 +4288,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; } lock_page(p); @@ -4435,8 +4390,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = xa_load(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -4575,7 +4530,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; int ret; @@ -4594,7 +4549,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, &epd->bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, 0, false); + end_bio_subpage_eb_writepage, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4620,7 +4575,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, { u64 disk_bytenr = eb->start; int i, num_pages; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); int ret 
= 0; prepare_eb_write(eb); @@ -4635,7 +4590,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, &epd->bio_ctrl, p, disk_bytenr, PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, false); + 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4749,7 +4704,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - end_write_bio(epd, ret); + submit_write_bio(epd, ret); return ret; } @@ -4928,10 +4883,6 @@ retry: index = 0; goto retry; } - if (ret < 0) { - end_write_bio(&epd, ret); - goto out; - } /* * If something went wrong, don't allow any metadata write bio to be * submitted. @@ -4958,21 +4909,17 @@ retry: * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. - */ - if (!BTRFS_FS_ERROR(fs_info)) { - flush_write_bio(&epd); - } else { - ret = -EROFS; - end_write_bio(&epd, ret); - } -out: - btrfs_zoned_meta_io_unlock(fs_info); - /* + * * We can get ret > 0 from submit_extent_page() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. */ if (ret > 0) ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) + ret = -EROFS; + submit_write_bio(&epd, ret); + + btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -5074,7 +5021,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - flush_write_bio(epd); + submit_write_bio(epd, 0); lock_page(page); } @@ -5085,7 +5032,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - flush_write_bio(epd); + submit_write_bio(epd, 0); wait_on_page_writeback(page); } @@ -5125,7 +5072,7 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - flush_write_bio(epd); + submit_write_bio(epd, 0); goto retry; } @@ -5136,26 +5083,6 @@ retry: return ret; } -int extent_write_full_page(struct page *page, struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - - flush_write_bio(&epd); - return ret; -} - /* * Submit the pages in the range to bio for call sites which delalloc range has * already been ran (aka, ordered extent inserted) and all pages are still @@ -5213,10 +5140,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - if (!found_error) - flush_write_bio(&epd); - else - end_write_bio(&epd, ret); + submit_write_bio(&epd, found_error ? 
ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5241,13 +5165,8 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); + submit_write_bio(&epd, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - flush_write_bio(&epd); return ret; } @@ -5269,9 +5188,7 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - - if (bio_ctrl.bio) - submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + submit_one_bio(&bio_ctrl); } /* @@ -6128,22 +6045,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6201,7 +6120,7 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return -EINVAL; } if (fs_info->nodesize >= PAGE_SIZE && - !IS_ALIGNED(start, PAGE_SIZE)) { + !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); @@ -6318,22 +6237,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6378,8 +6300,10 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - xa_erase(&fs_info->extent_buffers, - eb->start >> fs_info->sectorsize_bits); + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> fs_info->sectorsize_bits); + spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); } @@ -6598,7 +6522,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct 
btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6630,11 +6556,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_extent_readpage, mirror_num, 0, - true); + end_bio_extent_readpage, 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6643,10 +6568,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio_ctrl.bio) { - submit_one_bio(bio_ctrl.bio, mirror_num, 0); - bio_ctrl.bio = NULL; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6666,7 +6588,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6737,10 +6661,10 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + err = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, page_offset(page), PAGE_SIZE, 0, end_bio_extent_readpage, - mirror_num, 0, false); + 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6757,10 +6681,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio_ctrl.bio) { - submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); - bio_ctrl.bio = NULL; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -7324,25 +7245,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *eb; - unsigned long index; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; + struct extent_buffer *found = NULL; u64 page_start = page_offset(page); + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - xa_for_each_start(&fs_info->extent_buffers, index, eb, - page_start >> fs_info->sectorsize_bits) { - if (in_range(eb->start, page_start, PAGE_SIZE)) - return eb; - else if (eb->start >= page_start + PAGE_SIZE) - /* Already beyond page end */ - return NULL; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } + } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } - return NULL; +out: + return found; } static int try_release_subpage_extent_buffer(struct page *page) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 23d4103c8831..4bc72a87b9a9 
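
Unlike xa_insert(), radix_tree_insert() cannot allocate nodes while fs_info->buffer_lock is held, which is why both allocation sites reverted above share the preload dance: reserve nodes outside the lock, insert under it, and on -EEXIST either return the buffer that won the race or retry if that buffer is mid-teardown. Condensed into one helper (a sketch under the same field names, not the exact upstream code):

  static struct extent_buffer *insert_or_find_eb(struct btrfs_fs_info *fs_info,
                                                 struct extent_buffer *eb)
  {
          struct extent_buffer *exists;
          int ret;

  again:
          ret = radix_tree_preload(GFP_NOFS);     /* may sleep, so unlocked */
          if (ret)
                  return ERR_PTR(ret);

          spin_lock(&fs_info->buffer_lock);
          ret = radix_tree_insert(&fs_info->buffer_radix,
                                  eb->start >> fs_info->sectorsize_bits, eb);
          spin_unlock(&fs_info->buffer_lock);
          radix_tree_preload_end();

          if (ret == -EEXIST) {
                  exists = find_extent_buffer(fs_info, eb->start);
                  if (exists)
                          return exists;  /* caller frees its own eb */
                  /* Raced with a buffer that just dropped its last
                   * reference; it is about to leave the tree, retry. */
                  goto again;
          }
          return eb;
  }

The matching removal in release_extent_buffer() takes buffer_lock around radix_tree_delete(), so a reader holding only the RCU read lock never observes a half-updated tree.
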
100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -57,6 +57,7 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; @@ -142,15 +143,10 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -247,7 +243,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); @@ -266,15 +261,13 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; + int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fd827b99c1b..66c822182ecc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1848,7 +1848,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { - const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1902,15 +1901,6 @@ relock: } /* - * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() - * calls generic_write_sync() (through iomap_dio_complete()), because - * that results in calling fsync (btrfs_sync_file()) which will try to - * lock the inode in exclusive/write mode. - */ - if (is_sync_write) - iocb->ki_flags &= ~IOCB_DSYNC; - - /* * The iov_iter can be mapped to the same file range we are writing to. * If that's the case, then we will deadlock in the iomap code, because * it first calls our callback btrfs_dio_iomap_begin(), which will create @@ -1965,17 +1955,24 @@ again: btrfs_inode_unlock(inode, ilock_flags); /* - * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do - * the fsync (call generic_write_sync()). + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. */ - if (is_sync_write) - iocb->ki_flags |= IOCB_DSYNC; - - /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. 
*/ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * since we currently don't have NOWAIT semantics support for buffered IO + * and may block there for many reasons (reserving space for example). + */ + if (iocb->ki_flags & IOCB_NOWAIT) { + err = -EAGAIN; + goto out; + } + pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -2038,7 +2035,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb->ki_flags & IOCB_DSYNC; + const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -2058,9 +2055,11 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; } else if (iocb->ki_flags & IOCB_DIRECT) { - num_written = num_sync = btrfs_direct_write(iocb, from); + num_written = btrfs_direct_write(iocb, from); + num_sync = num_written; } else { - num_written = num_sync = btrfs_buffered_write(iocb, from); + num_written = btrfs_buffered_write(iocb, from); + num_sync = num_written; } btrfs_set_inode_last_sub_trans(inode); @@ -2308,7 +2307,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent @@ -2323,25 +2322,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. 
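
The new IOCB_NOWAIT check means a direct write that has to fall back to buffered IO now fails with -EAGAIN instead of blocking, pushing the retry decision to the caller (io_uring being the main one). From user space the contract looks roughly like this (a hypothetical helper, not part of any API):

  #define _GNU_SOURCE
  #include <errno.h>
  #include <sys/uio.h>

  /* Try a non-blocking write first; if the kernel would have to block
   * (e.g. btrfs falling back to buffered IO), EAGAIN tells us to retry
   * from a context where blocking is acceptable. */
  static ssize_t write_nowait_fallback(int fd, const void *buf,
                                       size_t len, off_t off)
  {
          struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
          ssize_t ret = pwritev2(fd, &iov, 1, off, RWF_NOWAIT);

          if (ret < 0 && errno == EAGAIN)
                  ret = pwritev2(fd, &iov, 1, off, 0);
          return ret;
  }
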
+ */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. + */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); @@ -2697,7 +2733,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, goto out; } rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; /* * 1 - update the inode @@ -2719,7 +2755,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2803,6 +2840,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. + */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2819,7 +2875,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; @@ -3042,7 +3099,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b1ae3ba2ca2c..996da650ecdc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3536,7 +3536,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, * data, keep it dense. 
*/ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3..f0c97d25b4a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + unsigned long *nr_written, int unlock, + u64 *done_offset); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate); - /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -195,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = page_offset(locked_page); - u64 page_end = page_start + PAGE_SIZE - 1; - + u64 page_start, page_end; struct page *page; + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it @@ -212,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -223,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, /* * Here we just clear all Ordered bits for every page in the - * range, then __endio_write_update_ordered() will handle + * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. 
*/ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, @@ -231,20 +230,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, put_page(page); } - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being instantiated - * then skip it, since the first page of a range is going to be - * properly cleaned up by the caller of run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } } - return __endio_write_update_ordered(inode, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -332,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); i++; ptr += cur_size; @@ -345,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); @@ -485,7 +487,7 @@ struct async_chunk { struct page *locked_page; u64 start; u64 end; - unsigned int write_flags; + blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; @@ -560,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(end + 1, PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) return 0; } @@ -678,8 +680,8 @@ again: * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } @@ -920,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. 
*/ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0); + &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) { ret = 0; goto out; } if (ret < 0) { - if (locked_page) + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + const u64 page_end = page_start + PAGE_SIZE - 1; + + btrfs_page_set_error(inode->root->fs_info, locked_page, + page_start, PAGE_SIZE); + set_page_writeback(locked_page); + end_page_writeback(locked_page); + end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); + } goto out; } @@ -1133,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return. + * + * When unlock == 1, we unlock the pages in successfully allocated regions. + * When unlock == 0, we leave them locked for writing them out. + * + * However, we unlock all the pages except @locked_page in case of failure. + * + * In summary, page locking state will be as follows: + * + * - page_started == 1 (return value) + * - All the pages are unlocked. IO is started. + * - Note that this can happen only on success + * - unlock == 1 + * - All the pages except @locked_page are unlocked in any case + * - unlock == 0 + * - On success, all the pages are locked for writing them out + * - On failure, all the pages except @locked_page are unlocked + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for an + * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock) + unsigned long *nr_written, int unlock, + u64 *done_offset) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; + u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; @@ -1329,18 +1365,62 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * If done_offset is non-NULL and ret == -EAGAIN, we expect the + * caller to write out the successfully allocated region and retry. + */ + if (done_offset && ret == -EAGAIN) { + if (orig_start < start) + *done_offset = start - 1; + else + *done_offset = start; + return ret; + } else if (ret == -EAGAIN) { + /* Convert to -ENOSPC since the caller cannot retry. */ + ret = -ENOSPC; + } + + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. + */ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + /* + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g., + * btrfs_run_delalloc_range().
EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (!unlock && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + } + /* - * If we reserved an extent for our delalloc range (or a subrange) and - * failed to create the respective ordered extent, then it means that - * when we reserved the extent we decremented the extent's size from - * the data space_info's bytes_may_use counter and incremented the - * space_info's bytes_reserved counter by the same amount. We must make - * sure extent_clear_unlock_delalloc() does not try to decrement again - * the data space_info's bytes_may_use counter, therefore we do not pass - * it the flag EXTENT_CLEAR_DATA_RESV. + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, @@ -1350,12 +1430,19 @@ out_unlock: page_ops); start += cur_alloc_size; if (start >= end) - goto out; + return ret; } + + /* + * For the range (3). We never touched the region. In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). 
+ */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); - goto out; + return ret; } /* @@ -1435,7 +1522,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, int i; bool should_compress; unsigned nofs_flag; - const unsigned int write_flags = wbc_to_write_flags(wbc); + const blk_opf_t write_flags = wbc_to_write_flags(wbc); unlock_extent(&inode->io_tree, start, end); @@ -1538,19 +1625,42 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, u64 end, int *page_started, unsigned long *nr_written) { + u64 done_offset = end; int ret; + bool locked_page_done = false; - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0); - if (ret) - return ret; + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0, &done_offset); + if (ret && ret != -EAGAIN) + return ret; - if (*page_started) - return 0; + if (*page_started) { + ASSERT(ret == 0); + return 0; + } + + if (ret == 0) + done_offset = end; + + if (done_offset == start) { + struct btrfs_fs_info *info = inode->root->fs_info; + + wait_var_event(&info->zone_finish_wait, + !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + continue; + } + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + } + locked_page_done = true; + extent_write_locked_range(&inode->vfs_inode, start, done_offset); + + start = done_offset + 1; + } - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1642,7 +1752,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + nr_written, 1, NULL); } struct can_nocow_file_extent_args { @@ -2115,7 +2225,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2131,6 +2241,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ @@ -2138,7 +2249,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; size = orig->end - orig->start + 1; - if (size > BTRFS_MAX_EXTENT_SIZE) { + if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; @@ -2147,10 +2258,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * applies here, just in reverse. 
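
run_delalloc_zoned() above turns a partial allocation into forward progress: cow_file_range() reports how far it got through done_offset together with -EAGAIN, the caller writes out that finished region and retries the rest, and when no progress at all was possible it waits for a zone finish before looping. A toy model of the loop (user space; the fake allocator stands in for the zoned extent allocator):

  #include <errno.h>
  #include <stdio.h>

  typedef unsigned long long u64;

  #define FAKE_ROOM 3ULL /* units the fake allocator can hand out at once */

  /* Allocate at most FAKE_ROOM units, reporting the end of the finished
   * region through *done, like the new done_offset out-parameter of
   * cow_file_range(). */
  static int fake_cow(u64 start, u64 end, u64 *done)
  {
          if (end - start + 1 <= FAKE_ROOM) {
                  *done = end;
                  return 0;
          }
          *done = start + FAKE_ROOM - 1;
          return -EAGAIN;
  }

  int main(void)
  {
          u64 start = 0, end = 9, done;

          while (start <= end) {
                  int ret = fake_cow(start, end, &done);

                  if (ret && ret != -EAGAIN)
                          return 1;
                  /* the real loop instead waits for a zone finish here
                   * when done_offset == start, i.e. zero forward progress */
                  printf("write out [%llu, %llu]%s\n", start, done,
                         ret == -EAGAIN ? ", retry the rest" : "");
                  start = done + 1;
          }
          return 0;
  }

This prints write out [0, 2], [3, 5] and [6, 8], each with a retry, and finally [9, 9].
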
*/ new_size = orig->end - split + 1; - num_extents = count_max_extents(new_size); + num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; - num_extents += count_max_extents(new_size); - if (count_max_extents(size) >= num_extents) + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) return; } @@ -2167,6 +2278,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; @@ -2180,7 +2292,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ - if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); @@ -2206,10 +2318,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, * this case. */ old_size = other->end - other->start + 1; - num_extents = count_max_extents(old_size); + num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; - num_extents += count_max_extents(old_size); - if (count_max_extents(new_size) >= num_extents) + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -2274,21 +2386,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits) + u32 bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); @@ -2303,7 +2415,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - if (*bits & EXTENT_DEFRAG) + if (bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) @@ -2312,7 +2424,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, } if (!(state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; @@ -2325,14 +2437,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * accounting happens. 
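
Threading fs_info into count_max_extents() matters because zoned devices can cap max_extent_size below the 128M BTRFS_MAX_EXTENT_SIZE constant, and the split/merge hooks above reserve or release one outstanding extent whenever the ceiling-division count changes. A sketch of the arithmetic (mirroring the shape of count_max_extents(); the exact upstream body is not reproduced here):

  #include <stdio.h>

  typedef unsigned long long u64;

  /* How many extent items a delalloc range of 'size' bytes may become,
   * given the per-filesystem maximum extent size. */
  static u64 count_max_extents(u64 size, u64 max_extent_size)
  {
          return (size + max_extent_size - 1) / max_extent_size;
  }

  int main(void)
  {
          u64 max = 128ULL << 20;  /* default BTRFS_MAX_EXTENT_SIZE */
          u64 size = 128ULL << 20; /* one reserved extent today */
          u64 split = 64ULL << 20;

          u64 before = count_max_extents(size, max);
          u64 after = count_max_extents(split, max) +
                      count_max_extents(size - split, max);

          /* 1 before vs 2 after: splitting this range costs one more
           * outstanding extent, exactly the case the split hook checks. */
          printf("before %llu after %llu\n", before, after);
          return 0;
  }
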
*/ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, - struct extent_state *state, unsigned *bits) + struct extent_state *state, u32 bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); @@ -2343,7 +2455,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); @@ -2356,7 +2468,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * don't need to call delalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_CLEAR_META_RESV && + if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); @@ -2366,7 +2478,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && - (*bits & EXTENT_CLEAR_DATA_RESV)) + (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, @@ -2381,11 +2493,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } if ((state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; - if (*bits & EXTENT_ADD_INODE_BYTES) + if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } @@ -2580,95 +2692,78 @@ out: return errno_to_blk_status(ret); } -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read. 
- * - * Rules about async/sync submit, - * a) read: sync submit - * - * b) write without checksum: sync submit - * - * c) write with checksum: - * c-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * - * c-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * - * c-3) otherwise: async submit - */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - blk_status_t ret = 0; - int skip_sum; - int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - - skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - - if (btrfs_is_free_space_inode(BTRFS_I(inode))) - metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + struct btrfs_inode *bi = BTRFS_I(inode); + blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *page = bio_first_bvec_all(bio)->bv_page; - loff_t file_offset = page_offset(page); - - ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + ret = extract_ordered_extent(bi, bio, + page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) goto out; } - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); - if (ret) - goto out; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing - * the bio if there were any errors, so just return - * here. - */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + /* + * If we need to checksum, and the I/O is not issued by fsync and + * friends, that is ->sync_writers != 0, defer the submission to a + * workqueue to parallelize it. + * + * Csum items for reloc roots have already been cloned at this point, + * so they are handled as part of the no-checksum case. + */ + if (!(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bi->root)) { + if (!atomic_read(&bi->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start)) return; - } else { - /* - * Lookup bio sums does extra checks around whether we - * need to csum or not, which is why we ignore skip_sum - * here. 
- */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); - if (ret) - goto out; - } - goto mapit; - } else if (async && !skip_sum) { - /* csum items have already been cloned */ - if (btrfs_is_data_reloc_root(root)) - goto mapit; - /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, - 0, btrfs_submit_bio_start); - goto out; - } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); if (ret) goto out; } + btrfs_submit_bio(fs_info, bio, mirror_num); + return; +out: + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } +} -mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; -out: + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing the bio + * if there were any errors, so just return here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; + } + + /* Save the original iter for read repair */ + btrfs_bio(bio)->iter = bio->bi_iter; + + /* + * Lookup bio sums does extra checks around whether we need to csum or + * not, which is why we ignore skip_sum here. + */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } /* @@ -3075,8 +3170,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - num_bytes = ram_bytes = oe->truncated_len; + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3102,7 +3199,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. 
*/ -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3195,6 +3292,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -3309,65 +3408,71 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work *work) -{ - struct btrfs_ordered_extent *ordered_extent; - ordered_extent = container_of(work, struct btrfs_ordered_extent, work); - btrfs_finish_ordered_io(ordered_extent); -} - void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, - finish_ordered_fn, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); +} + +/* + * Verify the checksum for a single sector without any extra action that depends + * on the type of I/O. + */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; } /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode - * @io_bio: btrfs_io_bio which contains the csum + * @bbio: btrfs_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page - * @start: logical offset in the file * * The length of such check is always one sector size. + * + * When csum mismatch is detected, we will also report the error and fill the + * corrupted range with zero. 
(Thus it needs the extra parameters) */ -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff, - u64 start) +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; u32 len = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); - kaddr = kmap_atomic(page); - shash->tfm = fs_info->csum_shash; - - crypto_shash_digest(shash, kaddr + pgoff, len, csum); - kunmap_atomic(kaddr); - - if (memcmp(csum, csum_expected, csum_size)) + if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; - return 0; + zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - bbio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), + bbio->file_offset + bio_offset, + csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -3399,11 +3504,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 pg_off; unsigned int result = 0; - if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { - btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); - return 0; - } - /* * This only happens for NODATASUM or compressed read. * Normally this should be covered by above check for compressed read @@ -3436,8 +3536,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, - page_offset(page) + pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -3576,7 +3675,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3659,17 +3757,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots. So here we can find if an + * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that xarray. + * up the root from that radix tree. */ - spin_lock(&fs_info->fs_roots_lock); - dead_root = xa_load(&fs_info->fs_roots, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3909,7 +4007,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in the delayed_nodes xarray. + * in delayed_nodes_tree. 
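btrfs_check_data_csum() above folds the removed offset arithmetic (bio_offset shifted by sectorsize_bits, scaled by csum_size) into a btrfs_csum_ptr() helper; the helper's definition is outside this diff, but judging by the open-coded math it replaces it is presumably equivalent to:

	/* Hedged sketch reconstructed from the deleted open-coded math. */
	static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info,
					 u8 *csums, u64 offset)
	{
		const u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;

		/* One checksum of csum_size bytes is stored per data sector. */
		return csums + offset_in_sectors * fs_info->csum_size;
	}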
*/ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4227,7 +4325,7 @@ skip_backref: /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). - * Besides that, doing it here would only cause extra unncessary btree + * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { @@ -4255,8 +4353,9 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = - dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; + dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4418,7 +4517,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); @@ -4857,7 +4957,6 @@ again: else memzero_page(page, (block_start - page_offset(page)) + offset, len); - flush_dcache_page(page); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); @@ -5060,9 +5159,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ if (newsize != oldsize) { inode_inc_iversion(inode); - if (!(mask & (ATTR_CTIME | ATTR_MTIME))) - inode->i_ctime = inode->i_mtime = - current_time(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } } if (newsize > oldsize) { @@ -5370,7 +5470,7 @@ void btrfs_evict_inode(struct inode *inode) if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5762,14 +5862,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, sub_root); + inode = new_simple_dir(dir->i_sb, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); - } - if (root != sub_root) btrfs_put_root(sub_root); - if (!IS_ERR(inode) && root != sub_root) { + if (IS_ERR(inode)) + return inode; + down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); @@ -6365,7 +6465,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. 
+ */ + btrfs_free_path(path); + path = NULL; if (args->subvol) { struct inode *parent; @@ -6422,8 +6528,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - ret = 0; - goto out; + return 0; discard: /* @@ -7505,7 +7610,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - *map = em = em2; + *map = em2; + em = em2; } if (IS_ERR(em2)) { @@ -7587,8 +7693,12 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, const u64 data_alloc_len = length; bool unlock_extents = false; + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ if (!write) - len = min_t(u64, len, fs_info->sectorsize); + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; @@ -7679,7 +7789,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOTBLK; goto unlock_err; } @@ -7811,8 +7933,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - __endio_write_update_ordered(BTRFS_I(inode), pos, - length, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); @@ -7834,10 +7956,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->file_offset, - dip->bytes, - !dip->bio.bi_status); + btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + dip->file_offset, dip->bytes, + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, @@ -7857,12 +7978,8 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, BUG_ON(bio_op(bio) == REQ_OP_WRITE); - if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) - return; - refcount_inc(&dip->refs); - if (btrfs_map_bio(fs_info, bio, mirror_num)) - refcount_dec(&dip->refs); + btrfs_submit_bio(fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7871,56 +7988,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - struct bio_vec bvec; - struct bvec_iter iter; - u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (uptodate && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset))) { + clean_io_failure(fs_info, failure_tree, io_tree, start, + bv.bv_page, btrfs_ino(BTRFS_I(inode)), + bv.bv_offset); + } else { + int ret; - __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec.bv_offset; - for (i = 0; i < nr_sectors; i++) { - u64 start = bbio->file_offset + bio_offset; - - ASSERT(pgoff < PAGE_SIZE); - if (uptodate && - (!csum || !check_data_csum(inode, bbio, - bio_offset, bvec.bv_page, - pgoff, start))) { - clean_io_failure(fs_info, failure_tree, io_tree, - start, bvec.bv_page, - btrfs_ino(BTRFS_I(inode)), - pgoff); - } else { - int ret; - - ret = btrfs_repair_one_sector(inode, &bbio->bio, - bio_offset, bvec.bv_page, pgoff, - start, bbio->mirror_num, - submit_dio_repair_bio); - if (ret) - err = errno_to_blk_status(ret); - } - ASSERT(bio_offset + sectorsize > bio_offset); - bio_offset += sectorsize; - pgoff += sectorsize; + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } } - return err; -} -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate) -{ - btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, - finish_ordered_fn, uptodate); + return err; } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode 
*inode, @@ -7955,51 +8051,43 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_dio_private_put(dip); } -static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, - struct inode *inode, u64 file_offset, int async_submit) +static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; - /* Check btrfs_submit_bio_hook() for rules about async submit. */ - if (async_submit) - async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - - if (!write) { - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - goto err; - } + /* Save the original iter for read repair */ + if (btrfs_op(bio) == BTRFS_MAP_READ) + btrfs_bio(bio)->iter = bio->bi_iter; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io); - goto err; - } else if (write) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + /* Check btrfs_submit_data_write_bio() for async submit rules */ + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io)) + return; + /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); - if (ret) - goto err; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + return; + } } else { - u64 csum_offset; - - csum_offset = file_offset - dip->file_offset; - csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= fs_info->csum_size; - btrfs_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, + file_offset - dip->file_offset); } map: - ret = btrfs_map_bio(fs_info, bio, 0); -err: - return ret; + btrfs_submit_bio(fs_info, bio, 0); } static void btrfs_submit_direct(const struct iomap_iter *iter, @@ -8112,14 +8200,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - status = btrfs_submit_dio_bio(bio, inode, file_offset, - async_submit); - if (status) { - bio_put(bio); - if (submit_len > 0) - refcount_dec(&dip->refs); - goto out_err_em; - } + btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8152,7 +8233,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, + &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -8167,31 +8249,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - int ret; - - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - /* - * If we are under memory pressure we will call this directly from the - * VM, we need to make sure we have the inode referenced for the ordered - * extent. If not just return like we didn't do anything. 
- */ - if (!igrab(inode)) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - ret = extent_write_full_page(page, wbc); - btrfs_add_delayed_iput(inode); - return ret; -} - static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -8255,30 +8312,24 @@ static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) } #ifdef CONFIG_MIGRATION -static int btrfs_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - int ret; + int ret = filemap_migrate_folio(mapping, dst, src, mode); - ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (PageOrdered(page)) { - ClearPageOrdered(page); - SetPageOrdered(newpage); + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } +#else +#define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, @@ -8495,7 +8546,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepage() function could + * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. 
*/ @@ -8586,10 +8637,9 @@ again: else zero_start = PAGE_SIZE; - if (zero_start != PAGE_SIZE) { + if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - flush_dcache_page(page); - } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -8672,7 +8722,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (!rsv) return -ENOMEM; rsv->size = min_size; - rsv->failfast = 1; + rsv->failfast = true; /* * 1 for the truncate slack space @@ -9193,8 +9243,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; + old_dir->i_mtime = ctime; + old_dir->i_ctime = ctime; + new_dir->i_mtime = ctime; + new_dir->i_ctime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; @@ -9457,9 +9509,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_time(old_dir); + old_dir->i_mtime = current_time(old_dir); + old_dir->i_ctime = old_dir->i_mtime; + new_dir->i_mtime = old_dir->i_mtime; + new_dir->i_ctime = old_dir->i_mtime; + old_inode->i_ctime = old_dir->i_mtime; if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9547,15 +9601,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int ret; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return btrfs_rename_exchange(old_dir, old_dentry, new_dir, - new_dentry); + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); + + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); - return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, - new_dentry, flags); + return ret; } struct btrfs_delalloc_work { @@ -9897,6 +9957,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; @@ -10174,9 +10235,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } -static int btrfs_encoded_io_compression_from_extent( - struct btrfs_fs_info *fs_info, - int compress_type) +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: @@ -10299,7 +10359,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10309,19 +10368,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, return ret; } - ret = 
btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) { - btrfs_bio_free_csum(bbio); - return ret; - } - atomic_inc(&priv->pending); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) { - atomic_dec(&priv->pending); - btrfs_bio_free_csum(bbio); - } - return ret; + btrfs_submit_bio(fs_info, bio, mirror_num); + return BLK_STS_OK; } static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) @@ -10333,7 +10382,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) u32 sectorsize = fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; - u64 start = priv->file_offset; u32 bio_offset = 0; if (priv->skip_csum || !uptodate) @@ -10346,10 +10394,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, - bvec->bv_page, pgoff, start)) + if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff)) return BLK_STS_IOERR; - start += sectorsize; bio_offset += sectorsize; pgoff += sectorsize; } @@ -10381,11 +10428,9 @@ static void btrfs_encoded_read_endio(struct bio *bio) bio_put(bio); } -static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, - u64 disk_bytenr, - u64 disk_io_size, - struct page **pages) +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { @@ -10616,7 +10661,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = count = em->block_len; + disk_io_size = em->block_len; + count = em->block_len; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, @@ -10779,15 +10825,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = -ENOMEM; goto out_pages; } - kaddr = kmap(pages[i]); + kaddr = kmap_local_page(pages[i]); if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap(pages[i]); + kunmap_local(kaddr); ret = -EFAULT; goto out_pages; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap(pages[i]); + kunmap_local(kaddr); } for (;;) { @@ -11416,15 +11462,12 @@ static const struct file_operations btrfs_dir_file_operations = { */ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, - .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btrfs_migratepage, -#endif + .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f79af919bc4..fe0cc816b4eb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1230,16 +1230,18 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } -static u32 get_extent_max_capacity(const struct extent_map *em) +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) { if 
(test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) return BTRFS_MAX_COMPRESSED; - return BTRFS_MAX_EXTENT_SIZE; + return fs_info->max_extent_size; } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *next; bool ret = false; @@ -1263,7 +1265,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * If the next extent is at its max capacity, defragging current extent * makes no sense, as the total number of extents won't change. */ - if (next->len >= get_extent_max_capacity(em)) + if (next->len >= get_extent_max_capacity(fs_info, em)) goto out; /* Skip older extent */ if (next->generation < newer_than) @@ -1400,6 +1402,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool locked, struct list_head *target_list, u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1484,7 +1487,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * Skip extents already at its max capacity, this is mostly for * compressed extents, which max cap is only 128K. */ - if (em->len >= get_extent_max_capacity(em)) + if (em->len >= get_extent_max_capacity(fs_info, em)) goto next; /* @@ -4243,26 +4246,6 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { @@ -4312,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, ignore_offset); + inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4355,13 +4338,79 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } +/** + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXCLOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success, in which case fs_info::balance_mutex is acquired and + * exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked. 
Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; - bool need_unlock; /* for mut. excl. ops lock */ + bool need_unlock = true; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -4378,53 +4427,12 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) goto out; } -again: - if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - mutex_lock(&fs_info->balance_mutex); - need_unlock = true; - goto locked; - } - - /* - * mut. excl. ops lock is locked. Three possibilities: - * (1) some other op is running - * (2) balance is running - * (3) balance is paused -- special case (think resume) - */ - mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - /* this is either (2) or (3) */ - if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); - /* - * Lock released to allow other waiters to continue, - * we'll reexamine the status again. 
- */ - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl && - !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - /* this is (3) */ - need_unlock = false; - goto locked; - } - - mutex_unlock(&fs_info->balance_mutex); - goto again; - } else { - /* this is (2) */ - mutex_unlock(&fs_info->balance_mutex); - ret = -EINPROGRESS; - goto out; - } - } else { - /* this is (1) */ - mutex_unlock(&fs_info->balance_mutex); - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) goto out; - } -locked: + lockdep_assert_held(&fs_info->balance_mutex); + if (bargs->flags & BTRFS_BALANCE_RESUME) { if (!fs_info->balance_ctl) { ret = -ENOTCONN; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 430ad36b8b08..89bc5f825e0a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data, out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, orig_out + compressed_size - *cur_out); - kunmap(cur_page); + kunmap_local(kaddr); if ((*cur_out / PAGE_SIZE) >= max_nr_page) return -E2BIG; @@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data, return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data, *cur_out += sector_bytes_left; out: - kunmap(cur_page); + kunmap_local(kaddr); return 0; } @@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap(page_in); + data_in = kmap_local_page(page_in); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); - kunmap(page_in); + kunmap_local(data_in); if (ret < 0) { pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; @@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -318,11 +317,8 @@ static void 
copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; - kaddr = kmap(cur_page); - memcpy(dest + *cur_in - orig_in, - kaddr + offset_in_page(*cur_in), - copy_len); - kunmap(cur_page); + memcpy_from_page(dest + *cur_in - orig_in, cur_page, + offset_in_page(*cur_in), copy_len); *cur_in += copy_len; } @@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap(cb->compressed_pages[0]); + kaddr = kmap_local_page(cb->compressed_pages[0]); len_in = read_compress_length(kaddr); - kunmap(cb->compressed_pages[0]); + kunmap_local(kaddr); cur_in += LZO_LEN; /* @@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); - kunmap(cur_page); + kunmap_local(kaddr); cur_in += LZO_LEN; if (seg_len > WORKSPACE_CBUF_LENGTH) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1957b14b329a..1952ac85222c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -272,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&tree->lock); } +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + /* * Mark all ordered extents io inside the specified range finished. * - * @page: The invovled page for the opeartion. + * @page: The involved page for the operation. * For uncompressed buffered IO, the page status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the * endio function twice. - * @finish_func: The function to be executed when all the IO of an ordered - * extent are finished. * * This function is called for endio, thus the range must have ordered - * extent(s) coveri it. + * extent(s) covering it. 
+ * extent(s) covering it.
*/ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate) + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -401,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); cond_wake_up(&entry->wait); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_mark_finished(inode, entry); spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &entry->work); spin_lock_irqsave(&tree->lock, flags); } @@ -473,6 +479,7 @@ out: if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } spin_unlock_irqrestore(&tree->lock, flags); return finished; @@ -807,8 +814,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup(inode, entry); + } out: spin_unlock_irqrestore(&tree->lock, flags); return entry; @@ -848,8 +857,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( break; } out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_range(inode, entry); + } spin_unlock_irq(&tree->lock); return entry; } @@ -878,6 +889,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, ASSERT(list_empty(&ordered->log_list)); list_add_tail(&ordered->log_list, list); refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } spin_unlock_irq(&tree->lock); } @@ -901,6 +913,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first(inode, entry); out: spin_unlock_irq(&tree->lock); return entry; @@ -975,8 +988,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( /* No ordered extent in the range */ entry = NULL; out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first_range(inode, entry); + } + spin_unlock_irq(&tree->lock); return entry; } @@ -1055,6 +1071,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret = 0; + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ecad67a2c745..87792f85e2c4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -180,13 +180,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page 
*page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate); + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a5b623ee6fac..2feb5c20641a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -63,137 +63,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_io_context *bioc; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct work_struct work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* Size of each individual stripe on disk */ - u32 stripe_len; - - /* How many pages there are for the full stripe including P/Q */ - u16 nr_pages; - - /* How many sectors there are for the full stripe including P/Q */ - u16 nr_sectors; - - /* Number of data stripes (no p/q) */ - u8 nr_data; - - /* Numer of all stripes (including P/Q) */ - u8 real_stripes; - - /* How many pages there are for each stripe */ - u8 stripe_npages; - - /* How many sectors there are for each stripe */ - u8 stripe_nsectors; - - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - - /* Stripe number that we're scrubbing */ - u8 scrubp; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; - - atomic_t error; - /* - * these are two arrays of pointers. We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ - - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; - - /* Pointers to the sectors in the bio_list, for faster lookup */ - struct sector_ptr *bio_sectors; - - /* - * For subpage support, we need to map each sector to above - * stripe_pages. 
- */ - struct sector_ptr *stripe_sectors; - - /* Bitmap to record which horizontal stripe has data */ - unsigned long *dbitmap; - - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; - - /* Allocated with stripe_nsectors-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; -}; - static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct work_struct *work); @@ -347,6 +216,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) } } +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -358,7 +245,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; @@ -368,12 +254,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) if (!s || !full_page_sectors_uptodate(src, i)) continue; - d = dest->stripe_pages[i]; - if (d) - __free_page(d); - - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); index_stripe_sectors(src); @@ -391,6 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -590,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio) int ret = 1; spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); return ret; @@ -932,6 +816,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * Do this before unlock_stripe() so that no new bio can be added for + * this rbio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -1023,29 +913,30 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, * this does not allocate any pages for rbio->pages. 
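steal_rbio() above still gates on full_page_sectors_uptodate(), whose definition is not part of this diff. Given the per-sector uptodate flags in stripe_sectors[] that steal_rbio_page() maintains, a plausible sketch of that check:

	/* Hedged sketch -- the real function lives outside these hunks. */
	static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
					       unsigned int page_nr)
	{
		const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
		const u32 sectors_per_page = PAGE_SIZE / sectorsize;
		unsigned int i;

		/* A page may be stolen only if every sector on it is uptodate. */
		for (i = sectors_per_page * page_nr;
		     i < sectors_per_page * (page_nr + 1); i++) {
			if (!rbio->stripe_sectors[i].uptodate)
				return false;
		}
		return true;
	}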
*/ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_io_context *bioc, - u32 stripe_len) + struct btrfs_io_context *bioc) { const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; - const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - int nr_data = 0; void *p; - ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + sizeof(*rbio->bio_sectors) * num_sectors + sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + - sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), + sizeof(*rbio->finish_pointers) * real_stripes, GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1056,7 +947,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bioc = bioc; - rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; @@ -1081,18 +971,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->bio_sectors, num_sectors); CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); #undef CONSUME_ALLOC - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); - rbio->nr_data = nr_data; return rbio; } @@ -1135,8 +1018,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector, unsigned int stripe_nr, unsigned int sector_nr, - unsigned long bio_max_len, - unsigned int opf) + enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; @@ -1180,8 +1062,9 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), - opf, GFP_NOFS); + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), + op, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; bio->bi_private = rbio; @@ -1215,9 +1098,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->raid_map[0]; - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; - bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; @@ 
-1252,6 +1132,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_unlock_irq(&rbio->bio_list_lock); } +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; + + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; + } + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; +} + /* * this is called from one of two situations. We either * have a full stripe from the higher layers, or we've read all @@ -1266,7 +1174,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; + /* The total sector number inside the full stripe. */ + int total_sector_nr; int stripe; + /* Sector number inside a stripe. */ int sectornr; bool has_qstripe; struct bio_list bio_list; @@ -1282,6 +1193,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else BUG(); + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. @@ -1348,55 +1262,71 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Start writing. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + /* This vertical stripe has no data, skip it. 
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } if (likely(!bioc->num_tgtdevs)) goto write_data; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bioc->tgtdev_map[stripe]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + if (!bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->bioc->tgtdev_map[stripe], - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } write_data: @@ -1406,6 +1336,12 @@ write_data: while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -1433,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bioc->num_stripes; i++) { stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, rbio->stripe_len) && + if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } @@ -1455,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_data; i++) { u64 stripe_start = rbio->bioc->raid_map[i]; - if (in_range(logical, stripe_start, rbio->stripe_len)) + if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) return i; } return -1; @@ -1552,15 +1488,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. 
- * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1571,23 +1499,34 @@ static void raid_rmw_end_io(struct bio *bio) bio_put(bio); - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (atomic_dec_and_test(&rbio->stripes_pending)) + queue_work(rbio->bioc->fs_info->endio_raid56_workers, + &rbio->end_io_work); +} - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; +/* + * End io handler for the read phase of the RMW cycle. All the bios here are + * physical stripe bios we've read from the disk so we can recalculate the + * parity of the stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_rmw_end_io_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + rbio_orig_end_io(rbio, BLK_STS_IOERR); + return; + } /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write but if there + * are any failed stripes we'll reconstruct from parity first. */ validate_rbio_for_rmw(rbio); - return; - -cleanup: - - rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1598,9 +1537,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; struct bio_list bio_list; + const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -1612,38 +1551,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + /* Build a list of bios to read all the missing data sectors. */ + for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a page in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a page + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate page. If so, + * use it. 
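+		 * (An already uptodate sector at this point comes from the
+		 * stripe cache, typically via steal_rbio_page() from a cached
+		 * rbio, so re-reading it would be wasted I/O.)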
+ */ + if (sector->uptodate) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -1662,11 +1597,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_rmw_end_io; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_partial_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read_partial(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -1833,27 +1773,53 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) run_plug(plug); } +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + /* * our main entry point for writes from the rest of the FS. 
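+ *
+ * To illustrate the dbitmap update done by rbio_add_bio() above: with a
+ * 4K sector size (stripe_nsectors == 16 for the fixed 64K stripe), a 12K
+ * bio starting 8K into the full stripe sets bits 2-4, while a 4K bio
+ * starting 68K into the full stripe (4K into the second data stripe)
+ * sets bit 1, as the bit number is taken modulo stripe_nsectors and thus
+ * identifies the vertical stripe rather than the device.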
*/ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; + int ret = 0; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + ret = PTR_ERR(rbio); + goto out_dec_counter; } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); - btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; /* @@ -1863,8 +1829,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; + goto out_dec_counter; + return; } cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); @@ -1875,13 +1841,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; } else { ret = __raid56_parity_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); + goto out_dec_counter; } - return ret; + + return; + +out_dec_counter: + btrfs_bio_counter_dec(fs_info); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); } /* @@ -1939,7 +1910,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * which we have data when doing parity scrub. */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, rbio->dbitmap)) + !test_bit(sectornr, &rbio->dbitmap)) continue; /* @@ -2108,25 +2079,13 @@ cleanup_io: } /* - * This is called only for stripes we've read from disk to - * reconstruct the parity. + * This is called only for stripes we've read from disk to reconstruct the + * parity. */ -static void raid_recover_end_io(struct bio *bio) +static void raid_recover_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors - */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); @@ -2147,8 +2106,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2160,33 +2118,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. 
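	 * (A cached sector is not guaranteed to still match the on-disk
	 * data, and mixing cached and on-disk content would make the
	 * parity math rebuild garbage.)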
*/ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + if (rbio->faila == stripe || rbio->failb == stripe) { atomic_inc(&rbio->error); + /* Skip the current stripe. */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; } - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* - * the rmw code may have already read this - * page in - */ - sector = rbio_stripe_sector(rbio, stripe, sectornr); - if (sector->uptodate) - continue; - - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret < 0) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret < 0) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2209,11 +2165,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_recover_end_io; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } submit_bio(bio); } @@ -2236,28 +2197,27 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. 
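+ *
+ * Errors are reported by ending the bio with bio->bi_status set rather
+ * than through a return value, so the recovery path below is fully
+ * asynchronous from the caller's point of view.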
*/ -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); btrfs_bio(bio)->mirror_num = mirror_num; + } else { + btrfs_get_bioc(bioc); } - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + goto out_end_bio; } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { @@ -2265,18 +2225,13 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - if (generic_io) - btrfs_put_bioc(bioc); kfree(rbio); - return -EIO; + bio->bi_status = BLK_STS_IOERR; + goto out_end_bio; } - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); + if (generic_io) rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bioc(bioc); - } /* * Loop retry: @@ -2295,24 +2250,20 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - ret = lock_stripe_add(rbio); + if (lock_stripe_add(rbio)) + return; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return - */ - if (ret == 0) - __raid56_parity_recover(rbio); - /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done + * This adds our rbio to the list of rbios that will be handled after + * the current lock owner is done. 
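+	 * If lock_stripe_add() returned 0 instead, we hold the full stripe
+	 * lock and must kick off the recovery ourselves below.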
*/ - return 0; + __raid56_parity_recover(rbio); + return; +out_end_bio: + btrfs_bio_counter_dec(fs_info); + btrfs_put_bioc(bioc); + bio_endio(bio); } static void rmw_work(struct work_struct *work) @@ -2343,14 +2294,14 @@ static void read_rebuild_work(struct work_struct *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2374,7 +2325,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); /* * We have already increased bio_counter when getting bioc, record it @@ -2395,7 +2346,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, ASSERT(logical >= rbio->bioc->raid_map[0]); ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - rbio->stripe_len * rbio->nr_data); + BTRFS_STRIPE_LEN * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset / sectorsize; rbio->bio_sectors[index].page = page; @@ -2409,23 +2360,22 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe; - int sectornr; - - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - struct page *page; - int index = (stripe * rbio->stripe_nsectors + sectornr) * - sectorsize >> PAGE_SHIFT; + int total_sector_nr; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; } index_stripe_sectors(rbio); return 0; @@ -2437,7 +2387,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int sectornr; @@ -2460,7 +2410,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2497,7 +2447,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, 
&rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; void *parity; @@ -2525,7 +2475,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, sectornr, 1); + bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2547,12 +2497,12 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2566,7 +2516,7 @@ writeback: sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2584,6 +2534,12 @@ submit_write: while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_scrub_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -2671,24 +2627,14 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static void raid56_parity_scrub_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write, but if there + * are any failed stripes we'll reconstruct from parity first */ validate_rbio_for_parity_scrub(rbio); } @@ -2698,8 +2644,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2709,37 +2654,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a sector in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* Build a list of bios to read all the missing parts. 
*/ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate sector. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2758,11 +2704,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_parity_scrub_end_io; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -2797,13 +2748,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. */ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bioc, length); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index aaad08aefd7d..6f48f9e4c869 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,45 +7,179 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(const struct map_lookup *map) -{ - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; -} +#include <linux/workqueue.h> +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. 
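+	 * The table is keyed on the logical address of the full stripe
+	 * (bioc->raid_map[0]), so all bios touching the same full stripe
+	 * funnel into a single rbio.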
+	 */
+	struct list_head hash_list;
+
+	/* LRU list for the stripe cache */
+	struct list_head stripe_cache;
+
+	/* For scheduling work in the helper threads */
+	struct work_struct work;
+
+	/*
+	 * bio_list and bio_list_lock are used to add more bios into the stripe
+	 * in hopes of avoiding the full RMW
+	 */
+	struct bio_list bio_list;
+	spinlock_t bio_list_lock;
+
+	/*
+	 * Also protected by the bio_list_lock, the plug list is used by the
+	 * plugging code to collect partial bios while plugged. The stripe
+	 * locking code also uses it to hand off the stripe lock to the next
+	 * pending IO.
+	 */
+	struct list_head plug_list;
+
+	/* Flags that tell us if it is safe to merge with this bio. */
+	unsigned long flags;
+
+	/*
+	 * Set if we're doing a parity rebuild for a read from higher up, which
+	 * is handled differently from a parity rebuild as part of RMW.
+	 */
+	enum btrfs_rbio_ops operation;
+
+	/* How many pages there are for the full stripe including P/Q */
+	u16 nr_pages;
+
+	/* How many sectors there are for the full stripe including P/Q */
+	u16 nr_sectors;
+
+	/* Number of data stripes (no p/q) */
+	u8 nr_data;
+
+	/* Number of all stripes (including P/Q) */
+	u8 real_stripes;
+
+	/* How many pages there are for each stripe */
+	u8 stripe_npages;
+
+	/* How many sectors there are for each stripe */
+	u8 stripe_nsectors;
+
+	/* First bad stripe, -1 means no corruption */
+	s8 faila;
+
+	/* Second bad stripe (for RAID6 use) */
+	s8 failb;
+
+	/* Stripe number that we're scrubbing */
+	u8 scrubp;
+
+	/*
+	 * Size of all the bios in the bio_list. This helps us decide if the
+	 * rbio maps to a full stripe or not.
+	 */
+	int bio_list_bytes;
+
+	int generic_bio_cnt;
+
+	refcount_t refs;
+
+	atomic_t stripes_pending;
+
+	atomic_t error;
+
+	struct work_struct end_io_work;
+
+	/* Bitmap to record which horizontal stripe has data */
+	unsigned long dbitmap;
+
+	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
+	unsigned long finish_pbitmap;
+
+	/*
+	 * These are two arrays of pointers. We allocate the rbio big enough
+	 * to hold them both and setup their locations when the rbio is
+	 * allocated.
+	 */
+
+	/*
+	 * Pointers to pages that we allocated for reading/writing stripes
+	 * directly from the disk (including P/Q).
+	 */
+	struct page **stripe_pages;
+
+	/* Pointers to the sectors in the bio_list, for faster lookup */
+	struct sector_ptr *bio_sectors;
+
+	/*
+	 * For subpage support, we need to map each sector to above
+	 * stripe_pages.
+	 */
+	struct sector_ptr *stripe_sectors;
+
+	/* Allocated with real_stripes-many pointers for finish_*() calls */
+	void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * No matter if signed or not, (-1) always indicates that we could not grab
+ * the proper stripe number.
+ */
+struct raid56_bio_trace_info {
+	u64 devid;
+
+	/* The offset inside the stripe. (<= STRIPE_LEN) */
+	u32 offset;
+
+	/*
+	 * Stripe number.
+	 * 0 is the first data stripe, and nr_data for P stripe,
+	 * nr_data + 1 for Q stripe.
+ * >= real_stripes for + */ + u8 stripe_nr; +}; static inline int nr_data_stripes(const struct map_lookup *map) { - return map->num_stripes - nr_parity_stripes(map); + return map->num_stripes - btrfs_nr_parity_stripes(map->type); } + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) -struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u32 stripe_len, + struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c39f8b3a5a4a..9acf47b11fe6 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -5,6 +5,7 @@ #include "compression.h" #include "ctree.h" #include "delalloc-space.h" +#include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" @@ -22,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, int ret; inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); + if (!no_time_update) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -110,7 +113,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); - flush_dcache_page(page); } else { ret = btrfs_decompress(comp_type, data_start, page, offset_in_page(file_offset), @@ -132,10 +134,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * * So what's in the range [500, 4095] corresponds to zeroes. 
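	 * (memzero_page() below flushes the dcache itself, which is why the
	 * explicit flush_dcache_page() call could be dropped here.)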
*/ - if (datal < block_size) { + if (datal < block_size) memzero_page(page, datal, block_size - datal); - flush_dcache_page(page); - } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); btrfs_page_clear_checked(fs_info, page, file_offset, block_size); @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. */ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,6 +496,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); @@ -550,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; @@ -650,7 +658,8 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + const u64 bs = fs_info->sb->s_blocksize; int ret; /* @@ -661,6 +670,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -770,6 +781,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); + btrfs_btree_balance_dirty(fs_info); + return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e7b0323e6efd..3afe5fa50a63 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -135,15 +135,13 @@ struct scrub_parity { struct work_struct work; /* Mark the parity blocks which have data */ - unsigned long *dbitmap; + unsigned long dbitmap; /* * Mark the parity blocks which have data, but errors happen when * read data or check data */ - unsigned long *ebitmap; - - unsigned long bitmap[]; + unsigned long ebitmap; }; struct scrub_ctx { @@ -1218,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct 
btrfs_io_context *bioc) static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, u64 *raid_map, - u64 mapped_length, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1233,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, continue; if (logical >= raid_map[i] && - logical < raid_map[i] + mapped_length) + logical < raid_map[i] + BTRFS_STRIPE_LEN) break; } @@ -1337,7 +1334,6 @@ leave_nomem: scrub_stripe_index_and_offset(logical, bioc->map_type, bioc->raid_map, - mapped_length, bioc->num_stripes - bioc->num_tgtdevs, mirror_index, @@ -1380,19 +1376,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); - int ret; - int mirror_num; bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - - mirror_num = sector->sblock->sectors[0]->mirror_num; - ret = raid56_parity_recover(bio, sector->recover->bioc, - sector->recover->map_length, - mirror_num, 0); - if (ret) - return ret; + raid56_parity_recover(bio, sector->recover->bioc, + sector->sblock->sectors[0]->mirror_num, false); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); @@ -2197,7 +2186,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(bio, bioc, length); + rbio = raid56_alloc_missing_rbio(bio, bioc); if (!rbio) goto rbio_out; @@ -2406,13 +2395,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); } static void scrub_block_complete(struct scrub_block *sblock) @@ -2763,7 +2752,7 @@ static void scrub_free_parity(struct scrub_parity *sparity) struct scrub_sector *curr, *next; int nbits; - nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); if (nbits) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors += nbits; @@ -2795,8 +2784,8 @@ static void scrub_parity_bio_endio(struct bio *bio) struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, - sparity->nsectors); + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, + &sparity->dbitmap, sparity->nsectors); bio_put(bio); @@ -2814,8 +2803,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) u64 length; int ret; - if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, - sparity->nsectors)) + if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, + &sparity->ebitmap, sparity->nsectors)) goto out; length = sparity->logic_end - sparity->logic_start; @@ -2831,9 +2820,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, sparity->scrub_dev, - sparity->dbitmap, + &sparity->dbitmap, 
sparity->nsectors); if (!rbio) goto rbio_out; @@ -2847,7 +2836,7 @@ rbio_out: bioc_out: btrfs_bio_counter_dec(fs_info); btrfs_put_bioc(bioc); - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2856,11 +2845,6 @@ out: scrub_free_parity(sparity); } -static inline int scrub_calc_parity_bitmap_len(int nsectors) -{ - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); -} - static void scrub_parity_get(struct scrub_parity *sparity) { refcount_inc(&sparity->refs); @@ -3131,7 +3115,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int ret; struct scrub_parity *sparity; int nsectors; - int bitmap_len; path = btrfs_alloc_path(); if (!path) { @@ -3145,9 +3128,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; - bitmap_len = scrub_calc_parity_bitmap_len(nsectors); - sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, - GFP_NOFS); + ASSERT(nsectors <= BITS_PER_LONG); + sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); if (!sparity) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -3165,8 +3147,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_end = logic_end; refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->sectors_list); - sparity->dbitmap = sparity->bitmap; - sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; ret = 0; for (cur_logical = logic_start; cur_logical < logic_end; @@ -3429,20 +3409,22 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct extent_map *em, struct btrfs_device *scrub_dev, - int stripe_index, u64 dev_extent_len) + int stripe_index) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; struct blk_plug plug; + struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; u64 physical = map->stripes[stripe_index].physical; - const u64 physical_end = physical + dev_extent_len; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; /* The logical increment after finishing one stripe */ @@ -3569,8 +3551,8 @@ next: physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[stripe_index].physical + - dev_extent_len; + sctx->stat.last_physical = + map->stripes[stripe_index].physical + dev_stripe_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3639,8 +3621,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, map, scrub_dev, i, - dev_extent_len); + ret = scrub_stripe(sctx, bg, em, scrub_dev, i); if (ret) goto out; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa56890ff81f..e7671afcee4f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,12 +10,14 @@ #include <linux/mount.h> #include <linux/xattr.h> #include 
<linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
 #include <linux/compat.h>
 #include <linux/crc32c.h>
 
 #include "send.h"
+#include "ctree.h"
 #include "backref.h"
 #include "locking.h"
 #include "disk-io.h"
@@ -81,8 +83,12 @@ struct send_ctx {
 	char *send_buf;
 	u32 send_size;
 	u32 send_max_size;
-	u64 total_send_size;
-	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+	/*
+	 * Whether BTRFS_SEND_A_DATA attribute was already added to current
+	 * command (since protocol v2, data must be the last attribute).
+	 */
+	bool put_data;
+	struct page **send_buf_pages;
 	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
 	/* Protocol version compatibility requested */
 	u32 proto;
@@ -112,14 +118,14 @@ struct send_ctx {
 	 */
 	u64 cur_ino;
 	u64 cur_inode_gen;
-	int cur_inode_new;
-	int cur_inode_new_gen;
-	int cur_inode_deleted;
 	u64 cur_inode_size;
 	u64 cur_inode_mode;
 	u64 cur_inode_rdev;
 	u64 cur_inode_last_extent;
 	u64 cur_inode_next_write_offset;
+	bool cur_inode_new;
+	bool cur_inode_new_gen;
+	bool cur_inode_deleted;
 	bool ignore_cur_inode;
 
 	u64 send_progress;
@@ -127,7 +133,7 @@ struct send_ctx {
 	struct list_head new_refs;
 	struct list_head deleted_refs;
 
-	struct xarray name_cache;
+	struct radix_tree_root name_cache;
 	struct list_head name_cache_list;
 	int name_cache_size;
 
@@ -234,6 +240,9 @@ struct send_ctx {
 	 * Indexed by the inode number of the directory to be deleted.
 	 */
 	struct rb_root orphan_dirs;
+
+	struct rb_root rbtree_new_refs;
+	struct rb_root rbtree_deleted_refs;
 };
 
 struct pending_dir_move {
@@ -268,13 +277,14 @@ struct orphan_dir_info {
 struct name_cache_entry {
 	struct list_head list;
 	/*
-	 * On 32bit kernels, xarray has only 32bit indices, but we need to
-	 * handle 64bit inums. We use the lower 32bit of the 64bit inum to store
-	 * it in the tree. If more than one inum would fall into the same entry,
-	 * we use inum_aliases to store the additional entries. inum_aliases is
-	 * also used to store entries with the same inum but different generations.
+	 * radix_tree has only 32bit entries but we need to handle 64bit inums.
+	 * We use the lower 32bit of the 64bit inum to store it in the tree. If
+	 * more than one inum would fall into the same entry, we use radix_list
+	 * to store the additional entries. radix_list is also used to store
+	 * entries with the same inum but different generations.
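+	 * For example, inodes 0x100000001 and 0x200000001 both map to index
+	 * 1 on 32bit kernels and are then told apart by walking radix_list.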
*/ - struct list_head inum_aliases; + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -333,8 +343,8 @@ __maybe_unused static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) { switch (sctx->proto) { - case 1: return cmd < __BTRFS_SEND_C_MAX_V1; - case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + case 1: return cmd <= BTRFS_SEND_C_MAX_V1; + case 2: return cmd <= BTRFS_SEND_C_MAX_V2; default: return false; } } @@ -575,15 +585,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) while (pos < len) { ret = kernel_write(filp, buf + pos, len - pos, off); - /* TODO handle that correctly */ - /*if (ret == -ERESTARTSYS) { - continue; - }*/ if (ret < 0) return ret; - if (ret == 0) { + if (ret == 0) return -EIO; - } pos += ret; } @@ -596,6 +601,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) int total_len = sizeof(*hdr) + len; int left = sctx->send_max_size - sctx->send_size; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + if (unlikely(left < total_len)) return -EOVERFLOW; @@ -616,6 +624,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -691,8 +700,7 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - + hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } @@ -732,9 +740,8 @@ static int send_cmd(struct send_ctx *sctx) ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); - sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; + sctx->put_data = false; return ret; } @@ -840,7 +847,7 @@ out: */ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev) + u64 *gid, u64 *rdev, u64 *fileattr) { int ret; struct btrfs_inode_item *ii; @@ -870,6 +877,12 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, *gid = btrfs_inode_gid(path->nodes[0], ii); if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); + /* + * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's + * otherwise logically split to 32/32 parts. 
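+	 * The send stream transfers it as a single BTRFS_SEND_A_FILEATTR
+	 * u64 attribute (see send_fileattr() below).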
+ */ + if (fileattr) + *fileattr = btrfs_inode_flags(path->nodes[0], ii); return ret; } @@ -877,7 +890,7 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, static int get_inode_info(struct btrfs_root *root, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) + u64 *rdev, u64 *fileattr) { struct btrfs_path *path; int ret; @@ -886,7 +899,7 @@ static int get_inode_info(struct btrfs_root *root, if (!path) return -ENOMEM; ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev); + rdev, fileattr); btrfs_free_path(path); return ret; } @@ -1632,7 +1645,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) u64 right_gen; ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; left_ret = ret; @@ -1641,7 +1654,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) right_ret = -ENOENT; } else { ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; right_ret = ret; @@ -1804,7 +1817,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, if (dir_gen) { ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0) goto out; } @@ -1876,7 +1889,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1904,7 +1917,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL); + who_gen, who_mode, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1943,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, if (dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1966,7 +1979,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, } ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0) goto out; @@ -2024,9 +2037,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::inum_aliases. + * takes care of this with the help of name_cache_entry::radix_list. * In case of error, nce is kfreed. 
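+ * The per-index list head (nce_head) is allocated on first use and is
+ * freed again by name_cache_delete() once the last entry for that index
+ * is gone.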
*/ static int name_cache_insert(struct send_ctx *sctx, @@ -2035,7 +2048,8 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2044,14 +2058,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->inum_aliases, nce_head); + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2063,14 +2077,15 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->inum_aliases); + list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; @@ -2078,7 +2093,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - xa_erase(&sctx->name_cache, (unsigned long)nce->ino); + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2089,11 +2104,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, inum_aliases) { + list_for_each_entry(cur, nce_head, radix_list) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -2180,7 +2195,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in - * __record_new_ref + * record_new_ref_if_needed(). 
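+ * (An orphan name is the unique placeholder name, "o<ino>-<gen>-<idx>",
+ * generated by gen_unique_name() until the final name can be used.)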
*/ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) @@ -2495,6 +2510,39 @@ out: return ret; } +static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + if (sctx->proto < 2) + return 0; + + btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2574,7 +2622,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); - /* TODO Add otime support when the otime patches get into upstream */ + if (sctx->proto >= 2) + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); ret = send_cmd(sctx); @@ -2608,7 +2657,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) if (ino != sctx->cur_ino) { ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev); + NULL, NULL, &rdev, NULL); if (ret < 0) goto out; } else { @@ -2747,48 +2796,50 @@ struct recorded_ref { u64 dir; u64 dir_gen; int name_len; + struct rb_node node; + struct rb_root *root; }; -static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +static struct recorded_ref *recorded_ref_alloc(void) { - ref->full_path = path; - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; + struct recorded_ref *ref; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + return NULL; + RB_CLEAR_NODE(&ref->node); + INIT_LIST_HEAD(&ref->list); + return ref; } -/* - * We need to process new refs before deleted refs, but compare_tree gives us - * everything mixed. So we first record all refs and later process them. - * This function is a helper to record one ref. 
- */ -static int __record_ref(struct list_head *head, u64 dir, - u64 dir_gen, struct fs_path *path) +static void recorded_ref_free(struct recorded_ref *ref) { - struct recorded_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_KERNEL); if (!ref) - return -ENOMEM; + return; + if (!RB_EMPTY_NODE(&ref->node)) + rb_erase(&ref->node, ref->root); + list_del(&ref->list); + fs_path_free(ref->full_path); + kfree(ref); +} - ref->dir = dir; - ref->dir_gen = dir_gen; - set_ref_path(ref, path); - list_add_tail(&ref->list, head); - return 0; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; } static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; - new = kmalloc(sizeof(*ref), GFP_KERNEL); + new = recorded_ref_alloc(); if (!new) return -ENOMEM; new->dir = ref->dir; new->dir_gen = ref->dir_gen; - new->full_path = NULL; - INIT_LIST_HEAD(&new->list); list_add_tail(&new->list, list); return 0; } @@ -2799,9 +2850,7 @@ static void __free_recorded_refs(struct list_head *head) while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(cur->full_path); - list_del(&cur->list); - kfree(cur); + recorded_ref_free(cur); } } @@ -3311,7 +3360,7 @@ finish: * The parent inode might have been deleted in the send snapshot */ ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3486,11 +3535,11 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, } ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL); + &left_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL); + &right_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -3621,7 +3670,7 @@ static int is_ancestor(struct btrfs_root *root, } ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3713,7 +3762,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = get_inode_info(sctx->parent_root, ino, NULL, &parent_ino_gen, NULL, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { @@ -4307,185 +4356,169 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, - void *ctx, struct list_head *refs) +static int rbtree_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + int result; + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + if (data->name_len > ref->name_len) + return 1; + if (data->name_len < ref->name_len) + return -1; + result = strcmp(data->name, ref->name); + if (result > 0) + return 1; + if (result < 0) + return -1; + return 0; +} + +static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_ref_comp(entry, parent) < 0; +} + +static int 
record_ref_in_tree(struct rb_root *root, struct list_head *refs, + struct fs_path *name, u64 dir, u64 dir_gen, + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; + struct fs_path *path = NULL; + struct recorded_ref *ref = NULL; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + path = fs_path_alloc(); + if (!path) { + ret = -ENOMEM; + goto out; + } - ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) + ref = recorded_ref_alloc(); + if (!ref) { + ret = -ENOMEM; goto out; + } - ret = get_cur_path(sctx, dir, gen, p); + ret = get_cur_path(sctx, dir, dir_gen, path); if (ret < 0) goto out; - ret = fs_path_add_path(p, name); + ret = fs_path_add_path(path, name); if (ret < 0) goto out; - ret = __record_ref(refs, dir, gen, p); - + ref->dir = dir; + ref->dir_gen = dir_gen; + set_ref_path(ref, path); + list_add_tail(&ref->list, refs); + rb_add(&ref->node, root, rbtree_ref_less); + ref->root = root; out: - if (ret) - fs_path_free(p); + if (ret) { + if (path && (!ref || !ref->full_path)) + fs_path_free(path); + recorded_ref_free(ref); + } return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); -} - - -static int __record_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_new_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { + int ret = 0; struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, dir, name, ctx, - &sctx->deleted_refs); -} - -static int record_new_ref(struct send_ctx *sctx) -{ - int ret; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_new_refs, + &sctx->new_refs, name, dir, dir_gen, + sctx); + } out: return ret; } -static int record_deleted_ref(struct send_ctx *sctx) +static int record_deleted_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { - int ret; + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, + &sctx->deleted_refs, name, dir, + dir_gen, sctx); + } out: return ret; } -struct find_ref_ctx { - u64 dir; - u64 dir_gen; - struct btrfs_root *root; - struct fs_path *name; - int found_idx; -}; - -static int __find_iref(int num, u64 dir, int 
index, - struct fs_path *name, - void *ctx_) -{ - struct find_ref_ctx *ctx = ctx_; - u64 dir_gen; - int ret; - - if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && - strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { - /* - * To avoid doing extra lookups we'll only do this if everything - * else matches. - */ - ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - if (dir_gen != ctx->dir_gen) - return 0; - ctx->found_idx = num; - return 1; - } - return 0; -} - -static int find_iref(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 dir, u64 dir_gen, struct fs_path *name) +static int record_new_ref(struct send_ctx *sctx) { int ret; - struct find_ref_ctx ctx; - - ctx.dir = dir; - ctx.name = name; - ctx.dir_gen = dir_gen; - ctx.found_idx = -1; - ctx.root = root; - ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - return ret; - - if (ctx.found_idx == -1) - return -ENOENT; - - return ctx.found_idx; -} - -static int __record_changed_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - u64 dir_gen; - int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_new_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + goto out; + ret = 0; +out: return ret; } -static int __record_changed_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_deleted_ref(struct send_ctx *sctx) { - u64 dir_gen; int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_deleted_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, record_deleted_ref_if_needed, + sctx); + if (ret < 0) + goto out; + ret = 0; +out: return ret; } @@ -4494,11 +4527,11 @@ static int record_changed_ref(struct send_ctx *sctx) int ret = 0; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_changed_new_ref, sctx); + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) goto out; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) goto out; ret = 0; @@ -4529,10 +4562,10 @@ static int process_all_refs(struct send_ctx *sctx, if (cmd == BTRFS_COMPARE_TREE_NEW) { root = sctx->send_root; - cb = __record_new_ref; + cb = record_new_ref_if_needed; } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { root = sctx->parent_root; - cb = __record_deleted_ref; + cb = record_deleted_ref_if_needed; } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); @@ -4860,14 +4893,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx) static int put_data_header(struct send_ctx *sctx, u32 len) { - struct btrfs_tlv_header *hdr; + if (WARN_ON_ONCE(sctx->put_data)) + return 
-EINVAL; + sctx->put_data = true; + if (sctx->proto >= 2) { + /* + * Since v2, the data attribute header doesn't include a length; + * the data implicitly extends to the end of the command. + */ + if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + return -EOVERFLOW; + put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); + sctx->send_size += sizeof(__le16); + } else { + struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) - return -EOVERFLOW; - hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); - put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); - put_unaligned_le16(len, &hdr->tlv_len); - sctx->send_size += sizeof(*hdr); + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + } return 0; } @@ -5010,7 +5057,7 @@ static int send_clone(struct send_ctx *sctx, if (clone_root->root == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL); + &gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5137,17 +5184,214 @@ tlv_put_failure: return ret; } -static int send_extent_data(struct send_ctx *sctx, - const u64 offset, - const u64 len) +static int send_encoded_inline_extent(struct send_ctx *sctx, + struct btrfs_path *path, u64 offset, + u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 ram_bytes; + size_t inline_size; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + ram_bytes - offset, len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + + ret = put_data_header(sctx, inline_size); + if (ret < 0) + goto out; + read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, + btrfs_file_extent_inline_start(ei), inline_size); + sctx->send_size += inline_size; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + u64 offset, u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info
*fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 disk_bytenr, disk_num_bytes; + u32 data_offset; + struct btrfs_cmd_header *hdr; + u32 crc; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, + len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, + btrfs_file_extent_ram_bytes(leaf, ei)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, + offset - key.offset + btrfs_file_extent_offset(leaf, ei)); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); + + ret = put_data_header(sctx, disk_num_bytes); + if (ret < 0) + goto out; + + /* + * We want to do I/O directly into the send buffer, so get the next page + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ + data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; + goto out; + } + + /* + * Note that send_buf is a mapping of send_buf_pages, so this is really + * reading into send_buf. 
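The reason the page-based read lands in send_buf is set up in btrfs_ioctl_send() further down: for protocol v2 the buffer is vmalloc'd and send_buf_pages records the struct page behind each PAGE_SIZE slice via vmalloc_to_page(). A rough, hypothetical sketch of that mapping step, assuming a page-aligned size:

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* Illustrative helper: collect the pages backing a vmalloc'd buffer. */
static struct page **pages_of_vmalloc_buf(void *buf, size_t size)
{
	unsigned int i, nr_pages = size >> PAGE_SHIFT;
	struct page **pages;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;
	for (i = 0; i < nr_pages; i++)
		pages[i] = vmalloc_to_page(buf + ((size_t)i << PAGE_SHIFT));
	return pages;
}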
+ */ + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + disk_bytenr, disk_num_bytes, + sctx->send_buf_pages + + (data_offset >> PAGE_SHIFT)); + if (ret) + goto out; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); + hdr->crc = 0; + crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + if (!ret) { + ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, + disk_num_bytes, &sctx->send_off); + } + sctx->send_size = 0; + sctx->put_data = false; + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + const u64 offset, const u64 len) { const u64 end = offset + len; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { + bool is_inline = (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_INLINE); + + /* + * Send the compressed extent unless the compressed data is + * larger than the decompressed data. This can happen if we're + * not sending the entire extent, either because it has been + * partially overwritten/truncated or because this is a part of + * the extent that we couldn't clone in clone_range(). + */ + if (is_inline && + btrfs_file_extent_inline_item_len(leaf, + path->slots[0]) <= len) { + return send_encoded_inline_extent(sctx, path, offset, + len); + } else if (!is_inline && + btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { + return send_encoded_extent(sctx, path, offset, len); + } + } + if (sctx->cur_inode == NULL) { struct btrfs_root *root = sctx->send_root; @@ -5285,12 +5529,9 @@ out: return ret; } -static int clone_range(struct send_ctx *sctx, - struct clone_root *clone_root, - const u64 disk_byte, - u64 data_offset, - u64 offset, - u64 len) +static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, + struct clone_root *clone_root, const u64 disk_byte, + u64 data_offset, u64 offset, u64 len) { struct btrfs_path *path; struct btrfs_key key; @@ -5314,7 +5555,7 @@ static int clone_range(struct send_ctx *sctx, */ if (clone_root->offset == 0 && len == sctx->send_root->fs_info->sectorsize) - return send_extent_data(sctx, offset, len); + return send_extent_data(sctx, dst_path, offset, len); path = alloc_path_for_send(); if (!path) @@ -5325,7 +5566,8 @@ static int clone_range(struct send_ctx *sctx, * accept clones from these extents. 
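Stepping back to send_extent_data() above: condensed, the heuristic it applies can be stated as a small predicate. The helper below is a hypothetical restatement of that logic, not a function in send.c:

#include <linux/types.h>

/*
 * Emit an encoded (compressed) write only when the caller allows it, the
 * extent is actually compressed, and the on-disk representation is no
 * larger than the range being sent; otherwise fall back to plain writes.
 */
static bool want_encoded_send(bool compressed_allowed, bool extent_compressed,
			      u64 compressed_len, u64 send_len)
{
	if (!compressed_allowed || !extent_compressed)
		return false;
	return compressed_len <= send_len;
}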
*/ ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL, + NULL); btrfs_release_path(path); if (ret < 0) goto out; @@ -5411,7 +5653,8 @@ static int clone_range(struct send_ctx *sctx, if (hole_len > len) hole_len = len; - ret = send_extent_data(sctx, offset, hole_len); + ret = send_extent_data(sctx, dst_path, offset, + hole_len); if (ret < 0) goto out; @@ -5484,14 +5727,16 @@ static int clone_range(struct send_ctx *sctx, if (ret < 0) goto out; } - ret = send_extent_data(sctx, offset + slen, + ret = send_extent_data(sctx, dst_path, + offset + slen, clone_len - slen); } else { ret = send_clone(sctx, offset, clone_len, clone_root); } } else { - ret = send_extent_data(sctx, offset, clone_len); + ret = send_extent_data(sctx, dst_path, offset, + clone_len); } if (ret < 0) @@ -5523,7 +5768,7 @@ next: } if (len > 0) - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; out: @@ -5554,10 +5799,10 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, end - offset); + ret = clone_range(sctx, path, clone_root, disk_byte, + data_offset, offset, end - offset); } else { - ret = send_extent_data(sctx, offset, end - offset); + ret = send_extent_data(sctx, path, offset, end - offset); } sctx->cur_inode_next_write_offset = end; return ret; @@ -6017,11 +6262,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 left_mode; u64 left_uid; u64 left_gid; + u64 left_fileattr; u64 right_mode; u64 right_uid; u64 right_gid; + u64 right_fileattr; int need_chmod = 0; int need_chown = 0; + bool need_fileattr = false; int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -6055,7 +6303,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL); + &left_mode, &left_uid, &left_gid, NULL, &left_fileattr); if (ret < 0) goto out; @@ -6070,7 +6318,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL); + &right_gid, NULL, &right_fileattr); if (ret < 0) goto out; @@ -6078,6 +6326,8 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) + need_fileattr = true; if ((old_size == sctx->cur_inode_size) || (sctx->cur_inode_size > old_size && sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) @@ -6121,6 +6371,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } + if (need_fileattr) { + ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_fileattr); + if (ret < 0) + goto out; + } ret = send_capabilities(sctx); if (ret < 0) @@ -6161,8 +6417,13 @@ static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, { struct parent_paths_ctx *ppctx = ctx; - return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, - ppctx->refs); + /* + * Pass 0 as the generation for 
the directory; we don't care about it + * here as we have no new references to add; we just want to delete all + * references for an inode. + */ + return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs, + name, dir, 0, ppctx->sctx); } /* @@ -6216,9 +6477,7 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) ret = send_unlink(sctx, ref->full_path); if (ret < 0) goto out; - fs_path_free(ref->full_path); - list_del(&ref->list); - kfree(ref); + recorded_ref_free(ref); } ret = 0; out: @@ -6265,7 +6524,7 @@ static int changed_inode(struct send_ctx *sctx, close_current_inode(sctx); sctx->cur_ino = key->objectid; - sctx->cur_inode_new_gen = 0; + sctx->cur_inode_new_gen = false; sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; @@ -6306,7 +6565,7 @@ */ if (left_gen != right_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - sctx->cur_inode_new_gen = 1; + sctx->cur_inode_new_gen = true; } /* @@ -6338,8 +6597,8 @@ if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6350,8 +6609,8 @@ ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6369,8 +6628,8 @@ * First, process the inode as if it was deleted. */ sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6384,8 +6643,8 @@ * Now process the inode as if it was new.
*/ sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6417,9 +6676,9 @@ static int changed_inode(struct send_ctx *sctx, goto out; } else { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_new_gen = 0; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = false; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6532,12 +6791,12 @@ static int dir_changed(struct send_ctx *sctx, u64 dir) int ret; ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret) return ret; ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; @@ -7518,7 +7777,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - xa_init_flags(&sctx->name_cache, GFP_KERNEL); + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; @@ -7533,6 +7792,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } else { sctx->proto = 1; } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { @@ -7552,8 +7815,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + if (sctx->proto >= 2) { + u32 send_buf_num_pages; + + sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; + sctx->send_buf_pages = kcalloc(send_buf_num_pages, + sizeof(*sctx->send_buf_pages), + GFP_KERNEL); + if (!sctx->send_buf_pages) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < send_buf_num_pages; i++) { + sctx->send_buf_pages[i] = + vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); + } + } else { + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + } if (!sctx->send_buf) { ret = -ENOMEM; goto out; @@ -7562,6 +7848,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, @@ -7746,6 +8034,7 @@ out: fput(sctx->send_filp); kvfree(sctx->clone_roots); + kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 08602fdd600a..4bb4e6a638cb 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -7,12 +7,19 @@ #ifndef BTRFS_SEND_H #define BTRFS_SEND_H -#include "ctree.h" +#include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" -#define BTRFS_SEND_STREAM_VERSION 1 +#define 
BTRFS_SEND_STREAM_VERSION 2 -#define BTRFS_SEND_BUF_SIZE SZ_64K +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no limit + * should be assumed. + */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K + +struct inode; +struct btrfs_ioctl_send_args; enum btrfs_tlv_type { BTRFS_TLV_U8, @@ -46,87 +53,117 @@ struct btrfs_tlv_header { /* commands */ enum btrfs_send_cmd { - BTRFS_SEND_C_UNSPEC, + BTRFS_SEND_C_UNSPEC = 0, /* Version 1 */ - BTRFS_SEND_C_SUBVOL, - BTRFS_SEND_C_SNAPSHOT, + BTRFS_SEND_C_SUBVOL = 1, + BTRFS_SEND_C_SNAPSHOT = 2, - BTRFS_SEND_C_MKFILE, - BTRFS_SEND_C_MKDIR, - BTRFS_SEND_C_MKNOD, - BTRFS_SEND_C_MKFIFO, - BTRFS_SEND_C_MKSOCK, - BTRFS_SEND_C_SYMLINK, + BTRFS_SEND_C_MKFILE = 3, + BTRFS_SEND_C_MKDIR = 4, + BTRFS_SEND_C_MKNOD = 5, + BTRFS_SEND_C_MKFIFO = 6, + BTRFS_SEND_C_MKSOCK = 7, + BTRFS_SEND_C_SYMLINK = 8, - BTRFS_SEND_C_RENAME, - BTRFS_SEND_C_LINK, - BTRFS_SEND_C_UNLINK, - BTRFS_SEND_C_RMDIR, + BTRFS_SEND_C_RENAME = 9, + BTRFS_SEND_C_LINK = 10, + BTRFS_SEND_C_UNLINK = 11, + BTRFS_SEND_C_RMDIR = 12, - BTRFS_SEND_C_SET_XATTR, - BTRFS_SEND_C_REMOVE_XATTR, + BTRFS_SEND_C_SET_XATTR = 13, + BTRFS_SEND_C_REMOVE_XATTR = 14, - BTRFS_SEND_C_WRITE, - BTRFS_SEND_C_CLONE, + BTRFS_SEND_C_WRITE = 15, + BTRFS_SEND_C_CLONE = 16, - BTRFS_SEND_C_TRUNCATE, - BTRFS_SEND_C_CHMOD, - BTRFS_SEND_C_CHOWN, - BTRFS_SEND_C_UTIMES, + BTRFS_SEND_C_TRUNCATE = 17, + BTRFS_SEND_C_CHMOD = 18, + BTRFS_SEND_C_CHOWN = 19, + BTRFS_SEND_C_UTIMES = 20, - BTRFS_SEND_C_END, - BTRFS_SEND_C_UPDATE_EXTENT, - __BTRFS_SEND_C_MAX_V1, + BTRFS_SEND_C_END = 21, + BTRFS_SEND_C_UPDATE_EXTENT = 22, + BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - __BTRFS_SEND_C_MAX_V2, + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_FILEATTR = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, /* End */ - __BTRFS_SEND_C_MAX, + BTRFS_SEND_C_MAX = 25, }; -#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) /* attributes in send stream */ enum { - BTRFS_SEND_A_UNSPEC, - - BTRFS_SEND_A_UUID, - BTRFS_SEND_A_CTRANSID, - - BTRFS_SEND_A_INO, - BTRFS_SEND_A_SIZE, - BTRFS_SEND_A_MODE, - BTRFS_SEND_A_UID, - BTRFS_SEND_A_GID, - BTRFS_SEND_A_RDEV, - BTRFS_SEND_A_CTIME, - BTRFS_SEND_A_MTIME, - BTRFS_SEND_A_ATIME, - BTRFS_SEND_A_OTIME, - - BTRFS_SEND_A_XATTR_NAME, - BTRFS_SEND_A_XATTR_DATA, - - BTRFS_SEND_A_PATH, - BTRFS_SEND_A_PATH_TO, - BTRFS_SEND_A_PATH_LINK, - - BTRFS_SEND_A_FILE_OFFSET, - BTRFS_SEND_A_DATA, - - BTRFS_SEND_A_CLONE_UUID, - BTRFS_SEND_A_CLONE_CTRANSID, - BTRFS_SEND_A_CLONE_PATH, - BTRFS_SEND_A_CLONE_OFFSET, - BTRFS_SEND_A_CLONE_LEN, - - __BTRFS_SEND_A_MAX, + BTRFS_SEND_A_UNSPEC = 0, + + /* Version 1 */ + BTRFS_SEND_A_UUID = 1, + BTRFS_SEND_A_CTRANSID = 2, + + BTRFS_SEND_A_INO = 3, + BTRFS_SEND_A_SIZE = 4, + BTRFS_SEND_A_MODE = 5, + BTRFS_SEND_A_UID = 6, + BTRFS_SEND_A_GID = 7, + BTRFS_SEND_A_RDEV = 8, + BTRFS_SEND_A_CTIME = 9, + BTRFS_SEND_A_MTIME = 10, + BTRFS_SEND_A_ATIME = 11, + BTRFS_SEND_A_OTIME = 12, + + BTRFS_SEND_A_XATTR_NAME = 13, + BTRFS_SEND_A_XATTR_DATA = 14, + + BTRFS_SEND_A_PATH = 15, + BTRFS_SEND_A_PATH_TO = 16, + BTRFS_SEND_A_PATH_LINK = 17, + + BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. 
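To make that framing concrete: a v2 receiver recovers the data length from the command header rather than from the attribute itself. A hypothetical sketch of the computation, mirroring the rule described above rather than any existing helper:

#include <linux/types.h>

/*
 * cmd_len is the length field from the command header (bytes following the
 * header); consumed is how many attribute bytes precede BTRFS_SEND_A_DATA.
 * The data payload is everything after the 2-byte attribute type.
 */
static u32 v2_data_attr_len(u32 cmd_len, u32 consumed)
{
	return cmd_len - consumed - sizeof(__le16);
}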
+ */ + BTRFS_SEND_A_DATA = 19, + + BTRFS_SEND_A_CLONE_UUID = 20, + BTRFS_SEND_A_CLONE_CTRANSID = 21, + BTRFS_SEND_A_CLONE_PATH = 22, + BTRFS_SEND_A_CLONE_OFFSET = 23, + BTRFS_SEND_A_CLONE_LEN = 24, + + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + /* + * File attributes from the FS_*_FL namespace (i_flags, xflags), + * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored + * in btrfs_inode_item::flags (represented by btrfs_inode::flags and + * btrfs_inode::ro_flags). + */ + BTRFS_SEND_A_FILEATTR = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* End */ + BTRFS_SEND_A_MAX = 31, }; -#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) -#ifdef __KERNEL__ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); -#endif #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dd8754cb990..d0cbeb7ae81c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,6 +9,7 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -187,6 +188,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +/* + * Calculate chunk size depending on volume type (regular or zoned). + */ +static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) +{ + if (btrfs_is_zoned(fs_info)) + return fs_info->zone_size; + + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + return SZ_1G; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + + /* Handle BTRFS_BLOCK_GROUP_METADATA */ + if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) + return SZ_1G; + + return SZ_256M; +} + +/* + * Update default chunk size. 
+ */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -208,6 +240,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; @@ -263,7 +296,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info) + bool active, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; int factor; @@ -274,6 +307,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, ASSERT(found); spin_lock(&found->lock); found->total_bytes += total_bytes; + if (active) + found->active_total_bytes += total_bytes; found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; @@ -337,6 +372,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } +static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + /* + * On regular filesystem, all total_bytes are always writable. On zoned + * filesystem, there may be a limitation imposed by max_active_zones. + * For metadata allocation, we cannot finish an existing active block + * group to avoid a deadlock. Thus, we need to consider only the active + * groups to be writable for metadata space. + */ + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return space_info->total_bytes; + + return space_info->active_total_bytes; +} + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) @@ -349,9 +400,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else + avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; return 0; } @@ -387,7 +441,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || + if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, @@ -671,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: + /* + * For metadata space on zoned filesystem, reaching here means we + * don't have enough space left in active_total_bytes. Try to + * activate a block group first, because we may have inactive + * block group already allocated. 
+ */ + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); + if (ret < 0) + break; + else if (ret == 1) + break; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -681,6 +747,23 @@ (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); + + /* + * For metadata space on zoned filesystem, allocating a new chunk + * is not enough. We still need to activate the block group. + * Activate the newly allocated block group by (maybe) finishing + * a block group. + */ + if (ret == 1) { + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + /* + * Revert to the original ret regardless of whether we could + * finish one block group or not. + */ + if (ret >= 0) + ret = 1; + } + if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -718,6 +801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; + u64 total; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -732,8 +816,9 @@ * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. */ - if (space_info->total_bytes + avail < used) - to_reclaim += used - (space_info->total_bytes + avail); + total = writable_total_bytes(fs_info, space_info); + if (total + avail < used) + to_reclaim += used - (total + avail); return to_reclaim; } @@ -743,9 +828,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; u64 used; + thresh = div_factor_fine(total, 90); + lockdep_assert_held(&space_info->lock); /* If we're just plain full then async reclaim just slows us down. */ @@ -807,8 +895,8 @@ BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; - if (used < space_info->total_bytes) - thresh += space_info->total_bytes - used; + if (used < total) + thresh += total - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1280,7 +1368,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still - * because we may have only satisified the priority tickets and still + * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ @@ -1525,7 +1613,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * can_overcommit() to ensure we can overcommit to continue.
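Reduced to its core, the check that writable_total_bytes() feeds can be written as a standalone predicate. The function below is a hypothetical restatement of the logic in btrfs_can_overcommit() and __reserve_bytes() after this change:

#include <linux/types.h>

/*
 * On zoned filesystems metadata may not overcommit (avail is forced to 0),
 * and only active block groups count toward writable space.
 */
static bool reservation_fits(u64 used, u64 bytes, u64 writable_total,
			     u64 avail, bool zoned_metadata)
{
	if (zoned_metadata)
		avail = 0;
	return used + bytes < writable_total + avail;
}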
*/ if (!pending_tickets && - ((used + orig_bytes <= space_info->total_bytes) || + ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c096695598c1..12fd6147f92d 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -19,12 +19,16 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + /* Total bytes in the space, but only accounts active block groups. */ + u64 active_total_bytes; u64 bytes_zone_unusable; /* total bytes that are unusable until resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + /* Chunk size in bytes */ + u64 chunk_size; /* * Once a block group drops below this threshold (percents) we'll @@ -122,7 +126,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info); + bool active, struct btrfs_space_info **space_info); +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index f429256f56db..12455b2b41de 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb, { const unsigned long member_offset = (unsigned long)ptr + off; - if (member_offset > eb->len) { + if (unlikely(member_offset + size > eb->len)) { btrfs_warn(eb->fs_info, - "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d", - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - if (member_offset + size > eb->len) { - btrfs_warn(eb->fs_info, - "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d", + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), (unsigned long)ptr, eb->start, member_offset, size); return false; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a105b291444f..6fc2b77ae5c3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -123,7 +123,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* - * We have cases like a dummy extent buffer page, which is not mappped + * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ if (page->mapping) @@ -731,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages() or extent_write_full_page(). + * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). 
* * - Page locked by lock_delalloc_pages() diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b1fdc6a26c76..4c7089b1681b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "block-group.h" #include "discard.h" #include "qgroup.h" +#include "raid56.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -72,7 +73,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data); #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) /* - * Characters to print to indicate error conditions or uncommon filesystem sate. + * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { @@ -763,6 +764,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -821,8 +824,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -883,8 +889,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, "metadata ratio %u", info->metadata_ratio); @@ -901,6 +910,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -933,6 +944,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1014,8 +1027,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -1030,13 +1047,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1044,8 +1063,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ 
-1059,8 +1082,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1906,10 +1932,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, - new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); @@ -1985,6 +2007,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + /* V1 cache is not supported for subpage mount. */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + ret = -EINVAL; + goto restore; + } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -2213,12 +2243,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1C3) - num_stripes = 3; - else if (type & BTRFS_BLOCK_GROUP_RAID1C4) - num_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; @@ -2242,17 +2268,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* - * In order to avoid overwriting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - * - * This ensures we have at least min_stripe_size free space - * after excluding 1MB. + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. 
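Numerically, the per-device estimate in btrfs_calc_avail_data_space() now excludes the fixed reserved range before anything else counts. A hypothetical sketch, where the stand-in constants mirror BTRFS_STRIPE_LEN and the 1MiB BTRFS_DEVICE_RANGE_RESERVED from the kernel headers:

#include <linux/kernel.h>
#include <linux/sizes.h>

#define SKETCH_STRIPE_LEN	SZ_64K	/* stands in for BTRFS_STRIPE_LEN */
#define SKETCH_RESERVED		SZ_1M	/* stands in for BTRFS_DEVICE_RANGE_RESERVED */

/* Illustrative: usable bytes on one device for new chunk allocation. */
static u64 usable_dev_space(u64 total_bytes, u64 bytes_used, u64 min_stripe_size)
{
	u64 avail = rounddown(total_bytes - bytes_used, SKETCH_STRIPE_LEN);

	/* Need min_stripe_size on top of the reserved range to count at all. */
	if (avail <= SKETCH_RESERVED + min_stripe_size)
		return 0;
	return avail - SKETCH_RESERVED;
}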
*/ - if (avail_space <= SZ_1M + min_stripe_size) + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; - avail_space -= SZ_1M; + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -2670,13 +2692,9 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_ref; - err = btrfs_end_io_wq_init(); - if (err) - goto free_prelim_ref; - err = btrfs_interface_init(); if (err) - goto free_end_io_wq; + goto free_prelim_ref; btrfs_print_mod_info(); @@ -2692,8 +2710,6 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); -free_end_io_wq: - btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: @@ -2731,7 +2747,6 @@ static void __exit exit_btrfs_fs(void) extent_state_cache_exit(); extent_io_exit(); btrfs_interface_exit(); - btrfs_end_io_wq_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 92a1fa8e3da6..d5d0717fd09a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "block-group.h" #include "qgroup.h" +#include "misc.h" /* * Structure name Path @@ -61,6 +62,10 @@ struct raid_kobject { .store = _store, \ } +#define BTRFS_ATTR_W(_prefix, _name, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) @@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { @@ -270,12 +276,10 @@ static umode_t btrfs_feature_visible(struct kobject *kobj, return mode; } -BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); -BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); @@ -283,9 +287,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -#ifdef CONFIG_BTRFS_DEBUG -/* Remove once support for zoned allocation is feature complete */ +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif +#ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif @@ -296,17 +301,15 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); /* * Features which depend on feature bits and may differ between each fs. 
* - * /sys/fs/btrfs/features - all available features implemeted by this version + * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { - BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), - BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), @@ -314,8 +317,10 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), +#endif +#ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY @@ -709,6 +714,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) +static ssize_t btrfs_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); +} + +/* + * Store new chunk size in space info. Can be called on a read-only filesystem. + * + * If the new chunk size value is larger than 10% of free space it is reduced + * to match that limit. Alignment must be to 256M and the system chunk size + * cannot be set. + */ +static ssize_t btrfs_chunk_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + char *retptr; + u64 val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!fs_info->fs_devices) + return -EINVAL; + + if (btrfs_is_zoned(fs_info)) + return -EINVAL; + + /* System block type must not be changed. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -EPERM; + + val = memparse(buf, &retptr); + /* There could be trailing '\n', also catch any typos after the value */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || val == 0) + return -EINVAL; + + val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); + + /* Limit stripe size to 10% of available space. */ + val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + + /* Must be multiple of 256M. */ + val &= ~((u64)SZ_256M - 1); + + /* Must be at least 256M. */ + if (val < SZ_256M) + return -EINVAL; + + btrfs_update_space_info_chunk_size(space_info, val); + + return len; +} + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Request chunk allocation with current chunk size. + */ +static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + struct btrfs_trans_handle *trans; + bool val; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + /* + * This is unsafe to be called from sysfs context and may cause + * unexpected problems. 
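The validation in btrfs_chunk_size_store() above boils down to: parse a human-readable size with memparse(), reject trailing junk, cap at roughly 10% of the writable bytes, and keep the result a non-zero multiple of 256M. A hypothetical standalone version of that normalization:

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/string.h>

static int parse_chunk_size(const char *buf, u64 total_rw_bytes, u64 *out)
{
	char *retptr;
	u64 val = memparse(buf, &retptr);

	/* There may be a trailing newline; anything else is a typo. */
	retptr = skip_spaces(retptr);
	if (*retptr != 0 || val == 0)
		return -EINVAL;

	val = min_t(u64, val, div_u64(total_rw_bytes, 10));	/* ~10% cap */
	val &= ~((u64)SZ_256M - 1);				/* 256M aligned */
	if (val < SZ_256M)
		return -EINVAL;

	*out = val;
	return 0;
}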
+ */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_force_chunk_alloc(trans, space_info->flags); + btrfs_end_transaction(trans); + + if (ret == 1) + return len; + + return -ENOSPC; +} +BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); + +#endif + SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -719,6 +830,7 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, @@ -773,6 +885,10 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, chunk_size), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(space_info, force_chunk_alloc), +#endif NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -871,6 +987,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); +static ssize_t btrfs_commit_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, + "commits %llu\n" + "last_commit_ms %llu\n" + "max_commit_ms %llu\n" + "total_commit_ms %llu\n", + fs_info->commit_stats.commit_count, + div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); +} + +static ssize_t btrfs_commit_stats_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long val; + int ret; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return ret; + if (val) + return -EINVAL; + + WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); + + return len; +} +BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1110,6 +1268,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), + BTRFS_ATTR_PTR(, commit_stats), NULL, }; @@ -1140,6 +1299,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) return to_fs_devs(kobj)->fs_info; } +static struct kobject *get_btrfs_kobj(struct kobject *kobj) +{ + while (kobj) { + if (kobj->ktype == &btrfs_ktype) + return kobj; + kobj = kobj->parent; + } + return NULL; +} + #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; @@ -2106,4 +2275,3 @@ void __cold btrfs_exit_sysfs(void) #endif kset_unregister(btrfs_kset); } - diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1591bfa55bcc..cc9377cf56a3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; + inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.type = 
BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; @@ -150,8 +151,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - unsigned long index; - struct extent_buffer *eb; + struct radix_tree_iter iter; + void **slot; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,9 +164,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - xa_for_each(&fs_info->extent_buffers, index, eb) { + spin_lock(&fs_info->buffer_lock); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + slot = radix_tree_iter_retry(&iter); + continue; + } + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); } + spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -186,7 +203,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 51a8b075c259..b7d181a08eab 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, nodesize); + path->nodes[0] = eb; if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 06c0a958d114..0bec10740ad3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include <linux/timekeeping.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -23,7 +24,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 +#define BTRFS_ROOT_TRANS_TAG 0 /* * Transaction states and transitions @@ -437,15 +438,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } - xa_set_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. 
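The btrfs_free_dummy_fs_info() hunk above reinstates the canonical radix-tree teardown loop: dereference each slot under the lock, handle exceptional entries with the retry helper, then use radix_tree_iter_resume() so the lock can be dropped around free_extent_buffer_stale(), which may sleep. The skeleton of that pattern, with a hypothetical tree, lock and do_blocking_work():

spin_lock(&lock);
radix_tree_for_each_slot(slot, &tree, &iter, 0) {
	item = radix_tree_deref_slot_protected(slot, &lock);
	if (!item)
		continue;
	if (radix_tree_exception(item)) {
		if (radix_tree_deref_retry(item))
			slot = radix_tree_iter_retry(&iter);
		continue;
	}
	slot = radix_tree_iter_resume(slot, &iter);
	spin_unlock(&lock);
	do_blocking_work(item);		/* may sleep */
	spin_lock(&lock);
}
spin_unlock(&lock);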
We don't want to @@ -487,9 +488,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1402,8 +1405,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root; - unsigned long index; + struct btrfs_root *gang[8]; + int i; + int ret; /* * At this point no one can be using this transaction to modify any tree @@ -1411,46 +1415,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_lock); - xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { - int ret; - - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. - */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); - - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); - - btrfs_free_log(trans, root); - ret = btrfs_update_reloc_root(trans, root); - if (ret) - return ret; - - /* See comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + int ret2; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. 
+ */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, root->node); + ret2 = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (ret2) + return ret2; + spin_lock(&fs_info->fs_roots_radix_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - - ret = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, &root->root_item); - if (ret) - return ret; - spin_lock(&fs_info->fs_roots_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } @@ -1817,8 +1832,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); + parent_inode->i_mtime = current_time(parent_inode); + parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2084,12 +2099,23 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } +static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +{ + fs_info->commit_stats.commit_count++; + fs_info->commit_stats.last_commit_dur = interval; + fs_info->commit_stats.max_commit_dur = + max_t(u64, fs_info->commit_stats.max_commit_dur, interval); + fs_info->commit_stats.total_commit_dur += interval; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; + ktime_t start_time; + ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); @@ -2214,6 +2240,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } } + /* + * Get the time spent on the work done by the commit thread and not + * the time spent waiting on a previous commit + */ + start_time = ktime_get_ns(); + extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2455,6 +2487,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); + interval = ktime_get_ns() - start_time; + btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2462,6 +2496,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); + update_commit_stats(fs_info, interval); + return ret; unlock_reloc: diff --git a/fs/btrfs/tree-log.c 
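The commit-stats hunks above deliberately sample the clock only after the task has finished waiting on any previous commit, so last_commit_ms reflects the work of this commit alone. Reduced to its shape (commits are serialized, so update_commit_stats() has a single writer and plain stores suffice; only the sysfs reset path earlier in this diff needs WRITE_ONCE()):

start_time = ktime_get_ns();	/* after waiting for prior commits */
/* ... flush delalloc, write out trees and super blocks ... */
interval = ktime_get_ns() - start_time;
update_commit_stats(fs_info, interval);	/* ns, shown as ms via div_u64() */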
b/fs/btrfs/tree-log.c index 370388fadf96..dcf75a8daa20 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -171,7 +171,7 @@ again: int index = (root->log_transid + 1) % 2; if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -194,7 +194,7 @@ again: * writing. */ if (zoned && !created) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -2287,7 +2287,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_key location; /* - * Currenly we only log dir index keys. Even if we replay a log created + * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). @@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; mutex_unlock(&root->log_mutex); goto out; } @@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_wake_log_root; } @@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } @@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, bool log_dentries = false; if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } if (btrfs_root_refs(&root->root_item) == 0) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } @@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } if (ret) @@ -7029,8 +7029,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * anyone from syncing the log until we have updated both inodes * in the log. */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. 
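BTRFS_LOG_FORCE_COMMIT replaces the mix of -EAGAIN and bare 1 that previously meant "the log cannot be used, fall back to a full transaction commit"; as a positive named sentinel it can no longer be confused with a real errno. A sketch of the caller contract, loosely modeled on the fsync path (not verbatim from this diff, error handling trimmed):

ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
if (ret == 0)
	ret = btrfs_sync_log(trans, root, &ctx);
if (ret == BTRFS_LOG_FORCE_COMMIT || ret < 0)
	ret = btrfs_commit_transaction(trans);	/* log unusable */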
+ */ + if (WARN_ON_ONCE(ret < 0)) + goto out; log_pinned = true; - btrfs_pin_log_trans(root); path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1620f8170629..57ab5f3b8dc7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,6 +12,9 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* We can't use the tree log for whatever reason, force a transaction commit */ +#define BTRFS_LOG_FORCE_COMMIT (1) + struct btrfs_log_ctx { int log_ret; int log_transid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c20049d1fec..272901514b0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -182,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags) return btrfs_raid_array[index].raid_name; } +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. @@ -238,7 +245,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -1396,12 +1402,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - /* - * We don't want to overwrite the superblock on the drive nor - * any area used by the boot loader (grub for example), so we - * make sure to start at an offset of at least 1MB. 
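dev_extent_search_start() above and the statfs hunk at the top of this section replace the literal SZ_1M with BTRFS_DEVICE_RANGE_RESERVED. The definition is presumably just a named alias (assumed to live in volumes.h):

/*
 * First 1MiB of every device is never allocated: it holds the primary
 * superblock (at 64KiB) and is left free for boot loaders such as grub.
 * (Assumed definition, matching every use in this diff.)
 */
#define BTRFS_DEVICE_RANGE_RESERVED	(SZ_1M)

Note that verify_one_dev_extent() later in this diff only warns, rather than fails, when a pre-4.1 mkfs.btrfs placed an extent inside this range.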
- */ - return max_t(u64, start, SZ_1M); + return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular @@ -5071,26 +5072,16 @@ static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { - u64 type = ctl->type; + struct btrfs_space_info *space_info; - if (type & BTRFS_BLOCK_GROUP_DATA) { - ctl->max_stripe_size = SZ_1G; - ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* For larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - ctl->max_stripe_size = SZ_1G; - else - ctl->max_stripe_size = SZ_256M; - ctl->max_chunk_size = ctl->max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - ctl->max_stripe_size = SZ_32M; - ctl->max_chunk_size = 2 * ctl->max_stripe_size; - ctl->devs_max = min_t(int, ctl->devs_max, - BTRFS_MAX_DEVS_SYS_CHUNK); - } else { - BUG(); - } + space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); + ASSERT(space_info); + + ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); + ctl->max_stripe_size = ctl->max_chunk_size; + + if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) + ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), @@ -5720,7 +5711,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct extent_map *em; struct map_lookup *map; - int ret; + enum btrfs_raid_types index; + int ret = 1; em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) @@ -5733,10 +5725,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) - ret = map->num_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID10) - ret = map->sub_stripes; + index = btrfs_bg_flags_to_raid_index(map->type); + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + ret = btrfs_raid_array[index].ncopies; else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) @@ -5748,8 +5741,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - else - ret = 1; free_extent_map(em); down_read(&fs_info->dev_replace.rwsem); @@ -5768,6 +5759,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return len; + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { @@ -5785,6 +5779,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return 0; + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { @@ -5917,18 +5914,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) kfree(bioc); } -/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ /* * Please note that, discard won't be sent to target device of device * replace. 
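init_alloc_chunk_ctl_policy_regular() above now pulls max_chunk_size from the space_info with READ_ONCE() instead of hard-coding per-type constants. The assumed store-side counterpart, btrfs_update_space_info_chunk_size() (called from the chunk_size sysfs handler earlier in this diff), publishes the value with WRITE_ONCE() so the allocator never observes a torn 64-bit store:

/* Assumed helper in space-info.c, paired with the READ_ONCE() above. */
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
					u64 chunk_size)
{
	WRITE_ONCE(space_info->chunk_size, chunk_size);
}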
*/ -static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - struct btrfs_io_context **bioc_ret) +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes) { struct extent_map *em; struct map_lookup *map; - struct btrfs_io_context *bioc; + struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5937,29 +5933,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 stripe_cnt; u64 stripe_len; u64 stripe_offset; - u64 num_stripes; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; - int ret = 0; + int ret; int i; - /* Discard always returns a bioc. */ - ASSERT(bioc_ret); - em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) - return PTR_ERR(em); + return ERR_CAST(em); map = em->map_lookup; + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; - goto out; - } + goto out_free_map; +} offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); @@ -5985,7 +5978,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - num_stripes = 1; + *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -5995,7 +5988,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; - num_stripes = min_t(u64, map->num_stripes, + *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= sub_stripes; @@ -6005,31 +5998,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, last_stripe *= sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = map->num_stripes; + *num_stripes = map->num_stripes; } else { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); } - bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); - if (!bioc) { + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); + if (!stripes) { ret = -ENOMEM; - goto out; + goto out_free_map; } - for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bioc->stripes[i].length = stripes_per_dev * - map->stripe_len; + stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bioc->stripes[i].length += map->stripe_len; + stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -6040,17 +6032,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bioc->stripes[i].length -= stripe_offset; + stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bioc->stripes[i].length -= stripe_end_offset; + stripes[i].length -= stripe_end_offset; if (i 
== sub_stripes - 1) stripe_offset = 0; } else { - bioc->stripes[i].length = length; + stripes[i].length = length; } stripe_index++; @@ -6060,12 +6052,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bioc_ret = bioc; - bioc->map_type = map->type; - bioc->num_stripes = num_stripes; -out: free_extent_map(em); - return ret; + return stripes; +out_free_map: + free_extent_map(em); + return ERR_PTR(ret); } /* @@ -6208,7 +6199,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + i; new->physical = old->physical; - new->length = old->length; new->dev = dev_replace->tgtdev; bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; @@ -6249,8 +6239,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; bioc->tgtdev_map[index_srcdev] = num_stripes; @@ -6472,6 +6460,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, @@ -6479,9 +6468,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; - max_errors = nr_parity_stripes(map); + max_errors = btrfs_chunk_max_errors(map); - *length = map->stripe_len; + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + + data_stripes * stripe_len) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6604,10 +6596,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, int mirror_num) { - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } @@ -6620,77 +6608,106 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc) +{ + if (bioc->orig_bio->bi_opf & REQ_META) + return bioc->fs_info->endio_meta_workers; + return bioc->fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = + container_of(work, struct btrfs_bio, end_io_work); + + bio_endio(&bbio->bio); +} + +static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) { - bio->bi_private = bioc->private; - bio->bi_end_io = bioc->end_io; - bio_endio(bio); + struct bio *orig_bio = bioc->orig_bio; + struct btrfs_bio *bbio = btrfs_bio(orig_bio); + + bbio->mirror_num = bioc->mirror_num; + orig_bio->bi_private = bioc->private; + orig_bio->bi_end_io = bioc->end_io; + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. 
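btrfs_map_discard() above returns a flat, caller-owned array of btrfs_discard_stripe instead of a refcounted btrfs_io_context, removing the get/put dance from the discard path entirely. A sketch of the consuming side (btrfs_issue_discard() stands in for the real per-device discard helper in extent-tree.c):

u32 num_stripes;
u64 discarded_bytes = 0;
struct btrfs_discard_stripe *stripes;
int i;

stripes = btrfs_map_discard(fs_info, logical, &length, &num_stripes);
if (IS_ERR(stripes))
	return PTR_ERR(stripes);

for (i = 0; i < num_stripes; i++)
	btrfs_issue_discard(stripes[i].dev->bdev, stripes[i].physical,
			    stripes[i].length, &discarded_bytes);

kfree(stripes);	/* plain array, no refcounting */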
+ */ + if (atomic_read(&bioc->error) > bioc->max_errors) + orig_bio->bi_status = BLK_STS_IOERR; + else + orig_bio->bi_status = BLK_STS_OK; + + if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work); + } else { + bio_endio(orig_bio); + } btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_io_context *bioc = bio->bi_private; - int is_orig_bio = 0; + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; if (bio->bi_status) { atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_bio(bio)->device; - - ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_FLUSH_ERRS); } } - if (bio == bioc->orig_bio) - is_orig_bio = 1; + if (bio != bioc->orig_bio) + bio_put(bio); btrfs_bio_counter_dec(bioc->fs_info); - - if (atomic_dec_and_test(&bioc->stripes_pending)) { - if (!is_orig_bio) { - bio_put(bio); - bio = bioc->orig_bio; - } - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - /* only send an error to the higher layers if it is - * beyond the tolerance of the btrfs bio - */ - if (atomic_read(&bioc->error) > bioc->max_errors) { - bio->bi_status = BLK_STS_IOERR; - } else { - /* - * this bio is actually up to date, we didn't - * go over the max number of errors - */ - bio->bi_status = BLK_STS_OK; - } - - btrfs_end_bioc(bioc, bio); - } else if (!is_orig_bio) { - bio_put(bio); - } + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, true); } -static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, - u64 physical, struct btrfs_device *dev) +static void submit_stripe_bio(struct btrfs_io_context *bioc, + struct bio *orig_bio, int dev_nr, bool clone) { struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_device *dev = bioc->stripes[dev_nr].dev; + u64 physical = bioc->stripes[dev_nr].physical; + struct bio *bio; + + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(orig_bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, false); + return; + } + + if (clone) { + bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set); + } else { + bio = orig_bio; + bio_set_dev(bio, dev->bdev); + btrfs_bio(bio)->device = dev; + } - bio->bi_private = bioc; - btrfs_bio(bio)->device = dev; + bioc->stripes[dev_nr].bioc = bioc; + bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@ -6708,8 +6725,8 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, } } btrfs_debug_in_rcu(fs_info, - "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, 
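The completion rework above ties in with the removal of btrfs_end_io_wq_init() at the top of this section: instead of wrapping bios at submission time, btrfs_end_bioc() now punts finished reads to a workqueue itself, since read completion work such as checksum verification needs process context. The deferral, in shape:

if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
	INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
	queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
} else {
	bio_endio(orig_bio);	/* writes complete inline */
}

Each per-stripe bio's bi_private now points at its btrfs_io_stripe, whose new bioc back-pointer (see the volumes.h hunk below) replaces the old bi_private-based bookkeeping in btrfs_end_bio().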
rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); @@ -6719,66 +6736,39 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, submit_bio(bio); } -static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) -{ - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) { - /* Should be the original bio. */ - WARN_ON(bio != bioc->orig_bio); - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - btrfs_end_bioc(bioc, bio); - } -} - -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num) +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { - struct btrfs_device *dev; - struct bio *first_bio = bio; u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = 0; - u64 map_length; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; int ret; int dev_nr; int total_devs; struct btrfs_io_context *bioc = NULL; - length = bio->bi_iter.bi_size; - map_length = length; - btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return; } total_devs = bioc->num_stripes; - bioc->orig_bio = first_bio; - bioc->private = first_bio->bi_private; - bioc->end_io = first_bio->bi_end_io; - atomic_set(&bioc->stripes_pending, bioc->num_stripes); + bioc->orig_bio = bio; + bioc->private = bio->bi_private; + bioc->end_io = bio->bi_end_io; + atomic_set(&bioc->stripes_pending, total_devs); if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - /* In this case, map_length has been set to the length of - a single stripe; not the whole write */ - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(bio, bioc, map_length); - } else { - ret = raid56_parity_recover(bio, bioc, map_length, - mirror_num, 1); - } - - btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + raid56_parity_write(bio, bioc); + else + raid56_parity_recover(bio, bioc, mirror_num, true); + return; } if (map_length < length) { @@ -6789,26 +6779,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bioc->stripes[dev_nr].dev; - if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, - &dev->dev_state) || - (btrfs_op(first_bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bioc_error(bioc, first_bio, logical); - continue; - } - - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(dev->bdev, first_bio); - } else { - bio = first_bio; - bio_set_dev(bio, dev->bdev); - } + const bool should_clone = (dev_nr < total_devs - 1); - submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, dev_nr, should_clone); } btrfs_bio_counter_dec(fs_info); - return BLK_STS_OK; } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, @@ -6966,11 +6941,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +u64 
btrfs_calc_stripe_length(const struct extent_map *em) { - const int data_stripes = calc_data_stripes(type, num_stripes); + const struct map_lookup *map = em->map_lookup; + const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(chunk_len, data_stripes); + return div_u64(em->len, data_stripes); } #if BITS_PER_LONG == 32 @@ -7109,8 +7085,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(type, em->len, - map->num_stripes); + em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7236,7 +7211,8 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = args.devid = btrfs_device_id(leaf, dev_item); + devid = btrfs_device_id(leaf, dev_item); + args.devid = devid; read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), @@ -7865,11 +7841,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); - btrfs_dev_stat_print_on_error(dev); -} -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) -{ if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, @@ -8011,7 +7983,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } map = em->map_lookup; - stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); + stripe_len = btrfs_calc_stripe_length(em); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", @@ -8021,6 +7993,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } + /* + * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * space. Although kernel can handle it without problem, better to warn + * the users. + */ + if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) + btrfs_warn(fs_info, + "devid %llu physical %llu len %llu inside the reserved space", + devid, physical_offset, physical_len); + for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6721002000ee..5639961b3626 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -355,6 +355,13 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +/* * Additional info to pass along bio. * * Mostly for btrfs specific features like csum and mirror_num. @@ -371,6 +378,9 @@ struct btrfs_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* For read end I/O handling */ + struct work_struct end_io_work; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -391,10 +401,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) } } +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
+ * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iters - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + struct btrfs_io_stripe { struct btrfs_device *dev; + union { + /* Block mapping */ + u64 physical; + /* For the endio handler */ + struct btrfs_io_context *bioc; + }; +}; + +struct btrfs_discard_stripe { + struct btrfs_device *dev; u64 physical; - u64 length; /* only used for discard mappings */ + u64 length; }; /* @@ -533,6 +569,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); @@ -541,8 +580,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -601,6 +639,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); +u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 767a0c6c9694..b4f44662cda7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - char *data_in; + char *data_in = NULL; char *cpage_out; int nr_pages = 0; struct page *in_page = NULL; @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = 
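A hypothetical use of the btrfs_bio_for_each_sector() iterator defined above; check_sector() is a stand-in for the caller's per-sector work, typically checksum verification. Each iteration yields one sectorsize-sized bio_vec plus its byte offset within the bio:

struct bio_vec bvl;
struct bvec_iter iter;
u32 bio_offset;

btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) {
	/* bvl spans exactly one sector of payload */
	check_sector(fs_info, bvl.bv_page, bvl.bv_offset, bio_offset);
}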
find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -245,7 +241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } + return ret; } @@ -287,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -309,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -336,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -355,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) done: zlib_inflateEnd(&workspace->strm); if (data_in) - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11237a913bee..b150b07ba1a7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -94,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] - * Empty[1] * x 0 - * In use[1] 0 x 0 - * Full[1] 1 1 C + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written @@ -415,6 +415,16 @@ int 
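The zlib (and, further down, zstd) conversions follow the kmap_local rules: mappings are CPU-local, must nest, and are released by address rather than by page, which is why the code now tracks the returned pointer (data_in, workspace->in_buf.src) instead of the page. Output pages are allocated with alloc_page(GFP_NOFS) and therefore cannot be highmem, so a plain page_address() suffices with no unmap at all. The input-side pattern:

char *data_in = kmap_local_page(in_page);	/* cheap, per-CPU mapping */
memcpy(workspace->buf, data_in, PAGE_SIZE);
kunmap_local(data_in);				/* unmap by address */
put_page(in_page);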
btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + /* + * We limit max_zone_append_size also by max_segments * + * PAGE_SIZE. Technically, we can have multiple pages per segment. But, + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. + */ + zone_info->max_zone_append_size = + min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -640,6 +650,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; + u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; @@ -674,6 +685,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = + zone_info->max_zone_append_size; } nr_devices++; } @@ -723,7 +739,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; /* * Check mount options here, because we might change fs_info->zoned @@ -1735,12 +1755,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc); if (ret || !bioc || mapped_length < PAGE_SIZE) { - btrfs_put_bioc(bioc); - return -EIO; + ret = -EIO; + goto out_put_bioc; } - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) - return -EINVAL; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; @@ -1759,7 +1781,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, break; } memalloc_nofs_restore(nofs_flag); - +out_put_bioc: + btrfs_put_bioc(bioc); return ret; } @@ -1826,6 +1849,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; @@ -1837,6 +1861,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (block_group->zone_is_active) { ret = true; @@ -1865,7 +1890,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* Successfully activated all the zones */ block_group->zone_is_active = 1; + space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1878,6 +1906,7 @@ 
bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); return ret; } @@ -1885,7 +1914,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; - bool need_zone_finish; int ret = 0; int i; @@ -1942,12 +1970,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } } - /* - * The block group is not fully allocated, so not fully written yet. We - * need to send ZONE_FINISH command to free up an active zone. - */ - need_zone_finish = !btrfs_zoned_bg_is_full(block_group); - block_group->zone_is_active = 0; block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; @@ -1963,15 +1985,13 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ if (device->zone_info->max_active_zones == 0) continue; - if (need_zone_finish) { - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - if (ret) - return ret; - } + if (ret) + return ret; btrfs_dev_clear_active_zone(device, physical); } @@ -1987,6 +2007,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* For active_bg_list */ btrfs_put_block_group(block_group); + clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + wake_up_all(&fs_info->zone_finish_wait); + return 0; } @@ -2023,6 +2046,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) } mutex_unlock(&fs_info->chunk_mutex); + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + return ret; } @@ -2139,3 +2165,123 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) factor = div64_u64(used * 100, total); return factor >= fs_info->bg_reclaim_threshold; } + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. 
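do_zone_finish() above now pairs with a new flag/waitqueue protocol: btrfs_can_activate_zone() sets BTRFS_FS_NEED_ZONE_FINISH when no zone can be activated, and finishing a zone clears the bit and wakes zone_finish_wait. The waiter side is not part of this hunk; presumably a writer that failed to activate a block group blocks like this:

/* Assumed waiter, matching the wake_up_all() above: */
wait_event(fs_info->zone_finish_wait,
	   !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags));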
*/ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? ret : 1; +} + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* No more block groups to activate */ + if (space_info->active_total_bytes == space_info->total_bytes) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index bb1a189e11f9..e17462db3a84 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -19,6 +19,7 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; + u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -77,6 +78,11 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, bool do_finish); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -243,6 +249,23 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) { return false; } + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + return 1; +} + +static inline int 
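btrfs_zone_finish_one_bg() above uses a common scan-for-minimum idiom under a lock: hold a reference on the current best candidate and drop the reference on the one it displaces, so exactly one block group reference survives the walk. The core of it:

avail = block_group->zone_capacity - block_group->alloc_offset;
if (avail < min_avail) {
	if (min_bg)
		btrfs_put_block_group(min_bg);	/* drop displaced candidate */
	min_bg = block_group;
	min_avail = avail;
	btrfs_get_block_group(min_bg);		/* pin the new minimum */
}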
btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + /* Consider all the block groups are active */ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0fe31a6f6e68..35a0224d4eb7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -403,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -415,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -450,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -462,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -477,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); + kunmap_local(workspace->in_buf.src); put_page(in_page); - start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -510,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -522,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -537,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = tot_out; out: *out_pages = nr_pages; - /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); put_page(in_page); } - if (out_page) - kunmap(out_page); return ret; } @@ -567,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -603,14 +595,15 @@ int zstd_decompress_bio(struct 
list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + kunmap_local(workspace->in_buf.src); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -619,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); + kunmap_local(workspace->in_buf.src); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 898c7f301b1b..55e762a58eb6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -52,7 +52,7 @@ #include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, +static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -282,10 +282,10 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* - * If none of the buffers had errors and they are all - * uptodate then we can set the page uptodate. + * If all of the buffers are uptodate then we can set the page + * uptodate. */ - if (page_uptodate && !PageError(page)) + if (page_uptodate) SetPageUptodate(page); unlock_page(page); return; @@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 1, &bh); put_bh(bh); } } @@ -1174,7 +1174,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1342,7 +1342,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -1353,7 +1353,7 @@ void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, { struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh)) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -1604,7 +1604,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); pgoff_t end; int i, count; @@ -1612,24 +1612,24 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec); - while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { - count = pagevec_count(&pvec); + folio_batch_init(&fbatch); + while (filemap_get_folios(bd_mapping, &index, end, 
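/*
 * [Illustrative aside, not part of the patch.] The buffer.c hunks above
 * merge the request operation and its modifier flags into one blk_opf_t
 * value (e.g. REQ_OP_READ | REQ_RAHEAD). The operation is recovered by
 * masking, exactly as the reworked submit_bh_wbc() and ll_rw_block() do:
 */
#include <linux/blk_types.h>

static bool opf_is_write(blk_opf_t opf)
{
	const enum req_op op = opf & REQ_OP_MASK;

	return op == REQ_OP_WRITE;
}
/* e.g. opf_is_write(REQ_OP_WRITE | REQ_SYNC | REQ_META) is true;
 * the REQ_* flag bits never alias the op field. */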
&fbatch)) { + count = folio_batch_count(&fbatch); for (i = 0; i < count; i++) { - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - if (!page_has_buffers(page)) + if (!folio_buffers(folio)) continue; /* - * We use page lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ - lock_page(page); - /* Recheck when the page is locked which pins bhs */ - if (!page_has_buffers(page)) + folio_lock(folio); + /* Recheck when the folio is locked which pins bhs */ + head = folio_buffers(folio); + if (!head) goto unlock_page; - head = page_buffers(page); bh = head; do { if (!buffer_mapped(bh) || (bh->b_blocknr < block)) @@ -1643,9 +1643,9 @@ next: bh = bh->b_this_page; } while (bh != head); unlock_page: - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); /* End of range already reached? */ if (index > end || !index) @@ -1716,7 +1716,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = wbc_to_write_flags(wbc); + blk_opf_t write_flags = wbc_to_write_flags(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1804,7 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1858,7 +1858,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); nr_underway++; } bh = next; @@ -2033,7 +2033,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); *wait_bh++=bh; } } @@ -2259,6 +2259,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; + bool page_error = false; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); @@ -2283,8 +2284,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); - if (err) + if (err) { folio_set_error(folio); + page_error = true; + } } if (!buffer_mapped(bh)) { folio_zero_range(folio, i * blocksize, @@ -2311,7 +2314,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) * All buffers are uptodate - we can set the folio uptodate * as well. But not if get_block() returned an error. 
*/ - if (!folio_test_error(folio)) + if (!page_error) folio_mark_uptodate(folio); folio_unlock(folio); return 0; @@ -2334,7 +2337,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); } return 0; } @@ -2534,330 +2537,6 @@ out_unlock: } EXPORT_SYMBOL(block_page_mkwrite); -/* - * nobh_write_begin()'s prereads are special: the buffer_heads are freed - * immediately, while under the page lock. So it needs a special end_io - * handler which does not touch the bh after unlocking it. - */ -static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); -} - -/* - * Attach the singly-linked list of buffers created by nobh_write_begin, to - * the page (converting it to circular linked list and taking care of page - * dirty races). - */ -static void attach_nobh_buffers(struct page *page, struct buffer_head *head) -{ - struct buffer_head *bh; - - BUG_ON(!PageLocked(page)); - - spin_lock(&page->mapping->private_lock); - bh = head; - do { - if (PageDirty(page)) - set_buffer_dirty(bh); - if (!bh->b_this_page) - bh->b_this_page = head; - bh = bh->b_this_page; - } while (bh != head); - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); -} - -/* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) - * The filesystem needs to handle block truncation upon failure. - */ -int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata, - get_block_t *get_block) -{ - struct inode *inode = mapping->host; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; - struct buffer_head *head, *bh; - struct page *page; - pgoff_t index; - unsigned from, to; - unsigned block_in_page; - unsigned block_start, block_end; - sector_t block_in_file; - int nr_reads = 0; - int ret = 0; - int is_mapped_to_disk = 1; - - index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; - - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - *fsdata = NULL; - - if (page_has_buffers(page)) { - ret = __block_write_begin(page, pos, len, get_block); - if (unlikely(ret)) - goto out_release; - return ret; - } - - if (PageMappedToDisk(page)) - return 0; - - /* - * Allocate buffers so that we can keep track of state, and potentially - * attach them to the page if an error occurs. In the common case of - * no error, they will just be freed again without ever being attached - * to the page (which is all OK, because we're under the page lock). - * - * Be careful: the buffer linked list is a NULL terminated one, rather - * than the circular one we're used to. - */ - head = alloc_page_buffers(page, blocksize, false); - if (!head) { - ret = -ENOMEM; - goto out_release; - } - - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); - - /* - * We loop across all blocks in the page, whether or not they are - * part of the affected region. This is so we can discover if the - * page is fully mapped-to-disk. 
- */ - for (block_start = 0, block_in_page = 0, bh = head; - block_start < PAGE_SIZE; - block_in_page++, block_start += blocksize, bh = bh->b_this_page) { - int create; - - block_end = block_start + blocksize; - bh->b_state = 0; - create = 1; - if (block_start >= to) - create = 0; - ret = get_block(inode, block_in_file + block_in_page, - bh, create); - if (ret) - goto failed; - if (!buffer_mapped(bh)) - is_mapped_to_disk = 0; - if (buffer_new(bh)) - clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { - set_buffer_uptodate(bh); - continue; - } - if (buffer_new(bh) || !buffer_mapped(bh)) { - zero_user_segments(page, block_start, from, - to, block_end); - continue; - } - if (buffer_uptodate(bh)) - continue; /* reiserfs does this */ - if (block_start < from || block_end > to) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_nobh; - submit_bh(REQ_OP_READ, 0, bh); - nr_reads++; - } - } - - if (nr_reads) { - /* - * The page is locked, so these buffers are protected from - * any VM or truncate activity. Hence we don't need to care - * for the buffer_head refcounts. - */ - for (bh = head; bh; bh = bh->b_this_page) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - ret = -EIO; - } - if (ret) - goto failed; - } - - if (is_mapped_to_disk) - SetPageMappedToDisk(page); - - *fsdata = head; /* to be released by nobh_write_end */ - - return 0; - -failed: - BUG_ON(!ret); - /* - * Error recovery is a bit difficult. We need to zero out blocks that - * were newly allocated, and dirty them to ensure they get written out. - * Buffers need to be attached to the page at this point, otherwise - * the handling of potential IO errors during writeout would be hard - * (could try doing synchronous writeout, but what if that fails too?) - */ - attach_nobh_buffers(page, head); - page_zero_new_buffers(page, from, to); - -out_release: - unlock_page(page); - put_page(page); - *pagep = NULL; - - return ret; -} -EXPORT_SYMBOL(nobh_write_begin); - -int nobh_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *head = fsdata; - struct buffer_head *bh; - BUG_ON(fsdata != NULL && page_has_buffers(page)); - - if (unlikely(copied < len) && head) - attach_nobh_buffers(page, head); - if (page_has_buffers(page)) - return generic_write_end(file, mapping, pos, len, - copied, page, fsdata); - - SetPageUptodate(page); - set_page_dirty(page); - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - unlock_page(page); - put_page(page); - - while (head) { - bh = head; - head = head->b_this_page; - free_buffer_head(bh); - } - - return copied; -} -EXPORT_SYMBOL(nobh_write_end); - -/* - * nobh_writepage() - based on block_full_write_page() except - * that it tries to operate without attaching bufferheads to - * the page. - */ -int nobh_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - int ret; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto out; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. 
It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_SIZE); -out: - ret = mpage_writepage(page, get_block, wbc); - if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc, - end_buffer_async_write); - return ret; -} -EXPORT_SYMBOL(nobh_writepage); - -int nobh_truncate_page(struct address_space *mapping, - loff_t from, get_block_t *get_block) -{ - pgoff_t index = from >> PAGE_SHIFT; - struct inode *inode = mapping->host; - unsigned blocksize = i_blocksize(inode); - struct folio *folio; - struct buffer_head map_bh; - size_t offset; - sector_t iblock; - int err; - - /* Block boundary? Nothing to do */ - if (!(from & (blocksize - 1))) - return 0; - - folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT, - mapping_gfp_mask(mapping)); - err = -ENOMEM; - if (!folio) - goto out; - - if (folio_buffers(folio)) - goto has_buffers; - - iblock = from >> inode->i_blkbits; - map_bh.b_size = blocksize; - map_bh.b_state = 0; - err = get_block(inode, iblock, &map_bh, 0); - if (err) - goto unlock; - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(&map_bh)) - goto unlock; - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (!folio_test_uptodate(folio)) { - err = mapping->a_ops->read_folio(NULL, folio); - if (err) { - folio_put(folio); - goto out; - } - folio_lock(folio); - if (!folio_test_uptodate(folio)) { - err = -EIO; - goto unlock; - } - if (folio_buffers(folio)) - goto has_buffers; - } - offset = offset_in_folio(folio, from); - folio_zero_segment(folio, offset, round_up(offset, blocksize)); - folio_mark_dirty(folio); - err = 0; - -unlock: - folio_unlock(folio); - folio_put(folio); -out: - return err; - -has_buffers: - folio_unlock(folio); - folio_put(folio); - return block_truncate_page(mapping, from, get_block); -} -EXPORT_SYMBOL(nobh_truncate_page); - int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { @@ -2915,7 +2594,7 @@ int block_truncate_page(struct address_space *mapping, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = -EIO; - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. 
*/ if (!buffer_uptodate(bh)) @@ -2994,9 +2673,10 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, +static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, struct writeback_control *wbc) { + const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; BUG_ON(!buffer_locked(bh)); @@ -3012,11 +2692,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, clear_buffer_write_io_error(bh); if (buffer_meta(bh)) - op_flags |= REQ_META; + opf |= REQ_META; if (buffer_prio(bh)) - op_flags |= REQ_PRIO; + opf |= REQ_PRIO; - bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO); + bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); @@ -3040,22 +2720,21 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, return 0; } -int submit_bh(int op, int op_flags, struct buffer_head *bh) +int submit_bh(blk_opf_t opf, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, NULL); + return submit_bh_wbc(opf, bh, NULL); } EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) - * @op: whether to %READ or %WRITE - * @op_flags: req_flag_bits + * @opf: block layer request operation and flags. * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * * ll_rw_block() takes an array of pointers to &struct buffer_heads, and * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. - * @op_flags contains flags modifying the detailed I/O behavior, most notably + * @opf contains flags modifying the detailed I/O behavior, most notably * %REQ_RAHEAD. * * This function drops any buffer that it cannot get a lock on (with the @@ -3072,8 +2751,9 @@ EXPORT_SYMBOL(submit_bh); * All of the buffers must be for the same device, and must also be a * multiple of the current approved size for the device. */ -void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) +void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[]) { + const enum req_op op = opf & REQ_OP_MASK; int i; for (i = 0; i < nr; i++) { @@ -3081,18 +2761,18 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) if (!trylock_buffer(bh)) continue; - if (op == WRITE) { + if (op == REQ_OP_WRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(op, op_flags, bh); + submit_bh(opf, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(op, op_flags, bh); + submit_bh(opf, bh); continue; } } @@ -3101,7 +2781,7 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) } EXPORT_SYMBOL(ll_rw_block); -void write_dirty_buffer(struct buffer_head *bh, int op_flags) +void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { @@ -3110,7 +2790,7 @@ void write_dirty_buffer(struct buffer_head *bh, int op_flags) } bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, op_flags, bh); + submit_bh(REQ_OP_WRITE | op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); @@ -3119,7 +2799,7 @@ EXPORT_SYMBOL(write_dirty_buffer); * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. 
*/ -int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) +int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { int ret = 0; @@ -3137,7 +2817,7 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE, op_flags, bh); + ret = submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); if (!ret && !buffer_uptodate(bh)) ret = -EIO; @@ -3365,7 +3045,7 @@ int bh_submit_read(struct buffer_head *bh) get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index a41ae6efc545..1fee702d5529 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -21,7 +21,8 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, * anon_fd. */ xas_for_each(&xas, req, ULONG_MAX) { - if (req->msg.opcode == CACHEFILES_OP_READ) { + if (req->msg.object_id == object_id && + req->msg.opcode == CACHEFILES_OP_READ) { req->error = -EIO; complete(&req->done); xas_store(&xas, NULL); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e5221be6eb55..d6e5916138e4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -394,11 +394,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) return 0; } -static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +static void ceph_netfs_free_request(struct netfs_io_request *rreq) { - struct inode *inode = mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - int got = (uintptr_t)priv; + struct ceph_inode_info *ci = ceph_inode(rreq->inode); + int got = (uintptr_t)rreq->netfs_priv; if (got) ceph_put_cap_refs(ci, got); @@ -406,12 +405,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, + .free_request = ceph_netfs_free_request, .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, - .cleanup = ceph_readahead_cleanup, }; #ifdef CONFIG_CEPH_FSCACHE @@ -1289,18 +1288,19 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(folio, 0)); + snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; - folio_unlock(folio); - folio_put(folio); + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); @@ -1322,10 +1322,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; 
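/*
 * [Illustrative aside, not part of the patch.] ceph's check_write_begin
 * hook now receives the folio by reference (struct folio **), so on a
 * snap conflict it can drop the folio and clear the pointer; under the
 * contract this series establishes, a NULL *foliop with a 0 return asks
 * netfslib to restart with a fresh folio. Hypothetical hook shape
 * (writeback_conflict() is an invented predicate):
 */
#include <linux/pagemap.h>

bool writeback_conflict(struct folio *folio);	/* invented for this sketch */

static int example_check_write_begin(struct file *file, loff_t pos,
				     unsigned int len, struct folio **foliop,
				     void **_fsdata)
{
	if (!writeback_conflict(*foliop))
		return 0;		/* keep the folio and proceed */

	folio_unlock(*foliop);
	folio_put(*foliop);
	*foliop = NULL;			/* folio consumed; caller must not reuse it */
	return 0;			/* retry with a new folio */
}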
int r; - r = netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); if (r == 0) folio_wait_fscache(folio); if (r < 0) { @@ -1798,7 +1799,7 @@ enum { static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; @@ -1913,7 +1914,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_mtime = ci->netfs.inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index ddea99922073..177d8e8d73fe 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!(inode->i_state & I_NEW)) return; - WARN_ON_ONCE(ci->netfs_ctx.cache); + WARN_ON_ONCE(ci->netfs.cache); - ci->netfs_ctx.cache = + ci->netfs.cache = fscache_acquire_cookie(fsc->fscache, 0, &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 7255b790a4c1..dc502daac49a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write); static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - return netfs_i_cookie(&ci->vfs_inode); + return netfs_i_cookie(&ci->netfs); } static inline void ceph_fscache_resize(struct inode *inode, loff_t to) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bf2e94005598..ac8fd5e7f540 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode, ci->i_hold_caps_max - jiffies); } @@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); @@ -531,7 +531,7 @@ no_change: static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) @@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + dout("__cap_delay_cancel %p\n", &ci->netfs.inode); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); @@ -568,7 +568,7 @@ static void __check_cap_issue(struct 
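/*
 * [Illustrative aside, not part of the patch.] netfs_write_begin() now
 * takes the netfs context as an explicit first argument instead of
 * deriving it from mapping->host, which is why ceph_write_begin() above
 * passes &ci->netfs. A hedged caller sketch:
 */
#include <linux/netfs.h>

static int example_write_begin(struct netfs_inode *ctx, struct file *file,
			       struct address_space *mapping, loff_t pos,
			       unsigned int len, struct folio **foliop)
{
	/* the trailing NULL is the optional fsdata out-parameter */
	return netfs_write_begin(ctx, file, mapping, pos, len, foliop, NULL);
}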
ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; @@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); + if (S_ISDIR(ci->netfs.inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->netfs.inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ - if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); @@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap) if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " - "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode, cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } @@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (!__cap_is_valid(cap)) continue; dout("__ceph_caps_issued %p cap %p issued %s\n", - &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + &ci->netfs.inode, cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; @@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap) spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { - dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", - &cap->ci->vfs_inode, cap, s->s_mds); + &cap->ci->netfs.inode, cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } @@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) continue; if ((cap->issued & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, + " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) have |= cap->issued; if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int __ceph_caps_issued_mask_metric(struct 
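/*
 * [Illustrative aside, not part of the patch.] The cap checks above all
 * reduce to one idiom: a grant satisfies a request only if every requested
 * bit is present, either from a single cap or from the union of several,
 * mirroring the structure of __ceph_caps_issued_mask():
 */
static int caps_satisfy_mask(const unsigned int *issued, int ncaps,
			     unsigned int mask)
{
	unsigned int have = 0;
	int i;

	for (i = 0; i < ncaps; i++) {
		if ((issued[i] & mask) == mask)
			return 1;	/* one cap alone covers the mask */
		have |= issued[i];
	}
	return (have & mask) == mask;	/* or several caps combined do */
}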
ceph_inode_info *ci, int mask, int touch) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); @@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int ret; spin_lock(&ci->i_ceph_lock); @@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci) if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || - (S_ISREG(ci->vfs_inode.i_mode) && - ci->vfs_inode.i_data.nrpages)) + (S_ISREG(ci->netfs.inode.i_mode) && + ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ @@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; @@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode); - mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); @@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * keep i_snap_realm. 
*/ if (ci->i_wr_ref == 0 && ci->i_snap_realm) - ceph_change_snap_realm(&ci->vfs_inode, NULL); + ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } @@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - fsc = ceph_inode_to_client(&ci->vfs_inode); + fsc = ceph_inode_to_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && - !ceph_inode_is_shutdown(&ci->vfs_inode)); + !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } @@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); @@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); if (!msg) { @@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; @@ -1622,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; int mds; @@ -1682,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct inode *inode = &ci->vfs_inode; + ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1696,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, return 0; } - dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; @@ -1712,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ci->i_snap_realm->cached_context); } dout(" inode %p now dirty snapc %p auth cap %p\n", - &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); @@ -1875,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = i_size_read(&ci->vfs_inode); + loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -1900,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) void 
ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; @@ -2467,7 +2467,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; @@ -2560,7 +2560,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2610,7 +2610,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2630,7 +2630,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, lockdep_assert_held(&ci->i_ceph_lock); - dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, + dout("%s %p flushing %s\n", __func__, &ci->netfs.inode, ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { @@ -2673,10 +2673,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) - ihold(&ci->vfs_inode); + ihold(&ci->netfs.inode); ci->i_wb_ref++; dout("%s %p wb %d -> %d (?)\n", __func__, - &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -3004,7 +3004,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return ret; } - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { @@ -3094,7 +3094,7 @@ enum put_cap_refs_mode { static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; @@ -3202,7 +3202,7 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; @@ -3698,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, session->s_mds, &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, - i_flushing_item)->vfs_inode); + i_flushing_item)->netfs.inode); } } mdsc->num_cap_flushing--; @@ -4345,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) break; list_del_init(&ci->i_cap_delay_list); - inode = igrab(&ci->vfs_inode); + inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); @@ -4373,10 +4373,11 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) while 
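/*
 * [Illustrative aside, not part of the patch.] flush_dirty_session_caps(),
 * whose loop continues just below, follows the standard pin-then-unlock
 * shape: take a reference (ihold) while the list lock is held, drop the
 * lock for the blocking work, then relock and re-test the list head.
 * Generic sketch; the helpers are invented, and process_item() is assumed
 * to unlink the entry (as ceph_check_caps() does for i_dirty_item),
 * otherwise the loop would never terminate:
 */
#include <linux/list.h>
#include <linux/spinlock.h>

struct work_item {
	struct list_head link;
};

void hold_item(struct work_item *it);		/* like ihold() */
void put_item(struct work_item *it);		/* like iput() */
void process_item(struct work_item *it);	/* may sleep; unlinks `it` */

static void drain_dirty(spinlock_t *lock, struct list_head *dirty)
{
	spin_lock(lock);
	while (!list_empty(dirty)) {
		struct work_item *it =
			list_first_entry(dirty, struct work_item, link);

		hold_item(it);		/* pin before dropping the lock */
		spin_unlock(lock);
		process_item(it);
		put_item(it);
		spin_lock(lock);	/* list may have changed; re-test */
	}
	spin_unlock(lock);
}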
(!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); + ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); spin_lock(&mdsc->cap_dirty_lock); @@ -4407,7 +4408,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci, void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; @@ -4441,7 +4442,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; @@ -4656,7 +4657,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali lockdep_assert_held(&ci->i_ceph_lock); dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c8226c0feac..da59e836a06e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -205,7 +205,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; struct ceph_file_info *fi; int ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b7e9cac3aeef..56c53ab3618e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -176,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_insert_color(&frag->node, &ci->i_fragtree); dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->vfs_inode), f); + ceph_vinop(&ci->netfs.inode), f); return frag; } @@ -457,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->vfs_inode); + dout("alloc_inode %p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops); spin_lock_init(&ci->i_ceph_lock); @@ -547,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - return &ci->vfs_inode; + return &ci->netfs.inode; } void ceph_free_inode(struct inode *inode) @@ -1978,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work) { struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_work); - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { dout("writeback %p\n", inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f5d110d90b77..33f517d549ce 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1564,7 +1564,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, p = 
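/*
 * [Illustrative aside, not part of the patch.] The ceph_alloc_inode()
 * hunk above shows the new netfs_inode_init() entry point. A filesystem
 * embedding struct netfs_inode hands the VFS the embedded inode at
 * allocation time; everything here other than the netfs calls is an
 * invented example:
 */
#include <linux/fs.h>
#include <linux/netfs.h>
#include <linux/slab.h>

struct example_inode_info {
	struct netfs_inode netfs;	/* netfs context wrapping the VFS inode */
	unsigned long flags;
};

static struct kmem_cache *example_cachep;	/* created at module init */
extern const struct netfs_request_ops example_netfs_ops;

static struct inode *example_alloc_inode(struct super_block *sb)
{
	struct example_inode_info *ei =
		kmem_cache_alloc(example_cachep, GFP_NOFS);

	if (!ei)
		return NULL;
	netfs_inode_init(&ei->netfs, &example_netfs_ops);
	return &ei->netfs.inode;	/* the VFS only ever sees this member */
}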
session->s_caps.next; while (p != &session->s_caps) { cap = list_entry(p, struct ceph_cap, session_caps); - inode = igrab(&cap->ci->vfs_inode); + inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; @@ -1622,7 +1622,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, int iputs; dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 322ee5add942..864cdaa0d2bd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o, static void ceph_queue_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap **pcapsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_snap_context *old_snapc, *new_snapc; struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; @@ -652,7 +652,7 @@ update_snapc: int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); @@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); + struct inode *inode = igrab(&ci->netfs.inode); if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); @@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b73b4f75462c..40140805bdcf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -876,7 +876,7 @@ mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); + inode_init_once(&ci->netfs.inode); } static int __init init_caches(void) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dd7dac0f984a..f59dac66955b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -316,11 +316,7 @@ struct ceph_inode_xattrs_info { * Ceph inode. 
*/ struct ceph_inode_info { - struct { - /* These must be contiguous */ - struct inode vfs_inode; - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; @@ -436,7 +432,7 @@ struct ceph_inode_info { static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { - return container_of(inode, struct ceph_inode_info, vfs_inode); + return container_of(inode, struct ceph_inode_info, netfs.inode); } static inline struct ceph_fs_client * @@ -1316,7 +1312,7 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) - ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); + ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } extern void ceph_handle_quota(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 8c2dc2c762a4..f141f5246163 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_string *pool_ns; s64 pool = ci->i_layout.pool_id; @@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); - dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { @@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, char *val, size_t size) { ssize_t ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ci->i_layout.pool_id; const char *pool_name; @@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); } @@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "client%lld", ceph_client_gid(fsc->client)); @@ -629,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci, } dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); return 0; } @@ -871,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) struct ceph_buffer *old_blob = NULL; void *dest; - dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + dout("__build_xattrs_blob %p\n", 
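/*
 * [Illustrative aside, not part of the patch.] The CEPH_I()/CIFS_I()
 * rewrites above rely on container_of() accepting a nested member
 * designator: offsetof(struct x, netfs.inode) is a compile-time constant,
 * so the wrapper is recovered from the inode pointer the VFS hands back,
 * now one level deeper than before. Generic form (struct foo_inode_info
 * is invented):
 */
#include <linux/container_of.h>
#include <linux/netfs.h>

struct foo_inode_info {
	struct netfs_inode netfs;
	int private_state;
};

static inline struct foo_inode_info *FOO_I(struct inode *inode)
{
	return container_of(inode, struct foo_inode_info, netfs.inode);
}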
&ci->netfs.inode); if (ci->i_xattrs.dirty) { int need = __get_required_blob_size(ci, 0, 0); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 1dd995efd5b8..2cfbac8bb965 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -162,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); else if (iface->sockaddr.ss_family == AF_INET6) seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); + if (!iface->is_active) + seq_puts(m, "\t\t[for-cleanup]\n"); } static int cifs_debug_files_proc_show(struct seq_file *m, void *v) @@ -221,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; + struct cifs_server_iface *iface; int c, i, j; seq_puts(m, @@ -456,11 +459,10 @@ skip_rdma: if (ses->iface_count) seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); - for (j = 0; j < ses->iface_count; j++) { - struct cifs_server_iface *iface; - - iface = &ses->iface_list[j]; - seq_printf(m, "\n\t%d)", j+1); + j = 0; + list_for_each_entry(iface, &ses->iface_list, + iface_head) { + seq_printf(m, "\n\t%d)", ++j); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 12c872800326..8f2e003e0590 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -377,7 +377,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->flags = 0; spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; - cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; @@ -389,12 +389,12 @@ cifs_alloc_inode(struct super_block *sb) * Can not set i_flags here - they get immediately overwritten to zero * by the VFS. 
*/ - /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ + /* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); INIT_LIST_HEAD(&cifs_inode->llist); INIT_LIST_HEAD(&cifs_inode->deferred_closes); spin_lock_init(&cifs_inode->deferred_lock); - return &cifs_inode->vfs_inode; + return &cifs_inode->netfs.inode; } static void @@ -1086,7 +1086,7 @@ struct file_system_type cifs_fs_type = { }; MODULE_ALIAS_FS("cifs"); -static struct file_system_type smb3_fs_type = { +struct file_system_type smb3_fs_type = { .owner = THIS_MODULE, .name = "smb3", .init_fs_context = smb3_init_fs_context, @@ -1418,7 +1418,7 @@ cifs_init_once(void *inode) { struct cifsInodeInfo *cifsi = inode; - inode_init_once(&cifsi->vfs_inode); + inode_init_once(&cifsi->netfs.inode); init_rwsem(&cifsi->lock_sem); } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index dd7e070ca243..b17be47a8e59 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry) return (unsigned long) dentry->d_fsdata; } -extern struct file_system_type cifs_fs_type; +extern struct file_system_type cifs_fs_type, smb3_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f873379066c7..a643c84ff1e9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,6 +80,9 @@ #define SMB_DNS_RESOLVE_INTERVAL_MIN 120 #define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 +/* smb multichannel query server interfaces interval in seconds */ +#define SMB_INTERFACE_POLL_INTERVAL 600 + /* maximum number of PDUs in one compound */ #define MAX_COMPOUND 5 @@ -933,15 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif struct cifs_server_iface { + struct list_head iface_head; + struct kref refcount; size_t speed; unsigned int rdma_capable : 1; unsigned int rss_capable : 1; + unsigned int is_active : 1; /* unset if non existent */ struct sockaddr_storage sockaddr; }; +/* release iface when last ref is dropped */ +static inline void +release_iface(struct kref *ref) +{ + struct cifs_server_iface *iface = container_of(ref, + struct cifs_server_iface, + refcount); + list_del_init(&iface->iface_head); + kfree(iface); +} + +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a has higher link speed, or rdma capable, or rss capable + * return -1 otherwise. 
+ */ +static inline int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ + int cmp_ret = 0; + + WARN_ON(!a || !b); + if (a->speed == b->speed) { + if (a->rdma_capable == b->rdma_capable) { + if (a->rss_capable == b->rss_capable) { + cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, + sizeof(a->sockaddr)); + if (!cmp_ret) + return 0; + else if (cmp_ret > 0) + return 1; + else + return -1; + } else if (a->rss_capable > b->rss_capable) + return 1; + else + return -1; + } else if (a->rdma_capable > b->rdma_capable) + return 1; + else + return -1; + } else if (a->speed > b->speed) + return 1; + else + return -1; +} + struct cifs_chan { unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; + struct cifs_server_iface *iface; /* interface in use */ __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; @@ -993,7 +1048,7 @@ struct cifs_ses { */ spinlock_t iface_lock; /* ========= begin: protected by iface_lock ======== */ - struct cifs_server_iface *iface_list; + struct list_head iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ /* ========= end: protected by iface_lock ======== */ @@ -1203,6 +1258,7 @@ struct cifs_tcon { #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ #endif + struct delayed_work query_interfaces; /* query interfaces workqueue job */ }; /* @@ -1479,20 +1535,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) #define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) +#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) #define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) +#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) /* * One of these for each file inode */ struct cifsInodeInfo { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ bool can_cache_brlcks; struct list_head llist; /* locks held by this inode */ /* @@ -1531,7 +1583,7 @@ struct cifsInodeInfo { static inline struct cifsInodeInfo * CIFS_I(struct inode *inode) { - return container_of(inode, struct cifsInodeInfo, vfs_inode); + return container_of(inode, struct cifsInodeInfo, netfs.inode); } static inline struct cifs_sb_info * diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3b7366ec03c7..d59aebefa71c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -636,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses, bool cifs_chan_needs_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server); +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); +int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); void extract_unc_hostname(const char *unc, const char 
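/*
 * [Illustrative aside, not part of the patch.] iface_cmp() above is a
 * lexicographic comparator: link speed first, then RDMA capability, then
 * RSS capability, with the raw sockaddr bytes as a final tie-break so the
 * order is total. A behavior-equivalent flattened form makes the key
 * order explicit:
 */
#include <linux/string.h>
#include <linux/types.h>

static int cmp_scalar(u64 a, u64 b)
{
	return (a > b) - (a < b);	/* -1, 0 or 1 */
}

static int iface_cmp_flat(const struct cifs_server_iface *a,
			  const struct cifs_server_iface *b)
{
	int r;

	r = cmp_scalar(a->speed, b->speed);
	if (r)
		return r;
	r = cmp_scalar(a->rdma_capable, b->rdma_capable);
	if (r)
		return r;
	r = cmp_scalar(a->rss_capable, b->rss_capable);
	if (r)
		return r;
	r = memcmp(&a->sockaddr, &b->sockaddr, sizeof(a->sockaddr));
	return (r > 0) - (r < 0);
}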
**h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d46702f5a663..386bb523c69e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) if (!server->hostname) return -EINVAL; + /* if server hostname isn't populated, there's nothing to do here */ + if (server->hostname[0] == '\0') + return 0; + len = strlen(server->hostname) + 3; unc = kmalloc(len, GFP_KERNEL); @@ -141,6 +145,25 @@ requeue_resolve: return rc; } +static void smb2_query_server_interfaces(struct work_struct *work) +{ + int rc; + struct cifs_tcon *tcon = container_of(work, + struct cifs_tcon, + query_interfaces.work); + + /* + * query server network interfaces, in case they change + */ + rc = SMB3_request_interfaces(0, tcon); + if (rc) { + cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); + } + + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); +} static void cifs_resolve_server(struct work_struct *work) { @@ -213,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { struct TCP_Server_Info *pserver; - struct cifs_ses *ses; + struct cifs_ses *ses, *nses; struct cifs_tcon *tcon; /* @@ -227,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { + /* check if iface is still active */ + if (!cifs_chan_is_iface_active(ses, server)) { + /* + * HACK: drop the lock before calling + * cifs_chan_update_iface to avoid deadlock + */ + ses->ses_count++; + spin_unlock(&cifs_tcp_ses_lock); + cifs_chan_update_iface(ses, server); + spin_lock(&cifs_tcp_ses_lock); + ses->ses_count--; + } + spin_lock(&ses->chan_lock); if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) goto next_session; @@ -1882,7 +1918,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - spin_lock(&ses->chan_lock); chan_count = ses->chan_count; /* close any extra channels */ @@ -1890,13 +1925,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses) int i; for (i = 1; i < chan_count; i++) { - spin_unlock(&ses->chan_lock); + if (ses->chans[i].iface) { + kref_put(&ses->chans[i].iface->refcount, release_iface); + ses->chans[i].iface = NULL; + } cifs_put_tcp_session(ses->chans[i].server, 0); - spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } - spin_unlock(&ses->chan_lock); sesInfoFree(ses); cifs_put_tcp_session(server, 0); @@ -2266,6 +2302,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); + /* cancel polling of interfaces */ + cancel_delayed_work_sync(&tcon->query_interfaces); + if (tcon->use_witness) { int rc; @@ -2503,6 +2542,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); + /* schedule query interfaces poll */ + INIT_DELAYED_WORK(&tcon->query_interfaces, + smb2_query_server_interfaces); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3978,10 +4023,16 @@ cifs_setup_session(const unsigned int xid, 
struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; bool is_binding = false; - spin_lock(&cifs_tcp_ses_lock); + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1618e0537d58..e64cda7a7610 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2004,7 +2004,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -2060,7 +2060,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, return rc; } - cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -4669,14 +4669,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) /* This inode is open for write at least once */ struct cifs_sb_info *cifs_sb; - cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); + cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { /* since no page cache to corrupt on directio we can change size safely */ return true; } - if (i_size_read(&cifsInode->vfs_inode) < end_of_file) + if (i_size_read(&cifsInode->netfs.inode) < end_of_file) return true; return false; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a638b29e9062..23ef56f55ce5 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); + cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd); - cifsi->netfs_ctx.cache = + cifsi->netfs.cache = fscache_acquire_cookie(tcon->fscache, 0, &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), - i_size_read(&cifsi->vfs_inode)); + i_size_read(&cifsi->netfs.inode)); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) @@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) if (cookie) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie); fscache_relinquish_cookie(cookie, false); - cifsi->netfs_ctx.cache = NULL; + cifsi->netfs.cache = NULL; } } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 52355c0912ae..aa3b941a5555 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -52,16 +52,16 @@ void cifs_fscache_fill_coherency(struct inode *inode, struct cifsInodeInfo *cifsi = CIFS_I(inode); memset(cd, 0, sizeof(*cd)); - cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); - cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); - cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); - cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); + 
cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec); } static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - return netfs_i_cookie(inode); + return netfs_i_cookie(&CIFS_I(inode)->netfs); } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2f9e7d2f81b6..81da81e18553 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -115,7 +115,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) __func__, cifs_i->uniqueid); set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); /* Invalidate fscache cookie */ - cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); + cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd); fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); } @@ -2499,7 +2499,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, u64 len) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; struct cifsFileInfo *cfile; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 35962a1a23b9..0e84e6fcf8ab 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -75,6 +75,7 @@ sesInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); spin_lock_init(&ret_buf->iface_lock); + INIT_LIST_HEAD(&ret_buf->iface_list); spin_lock_init(&ret_buf->chan_lock); } return ret_buf; @@ -83,6 +84,8 @@ sesInfoAlloc(void) void sesInfoFree(struct cifs_ses *buf_to_free) { + struct cifs_server_iface *iface = NULL, *niface = NULL; + if (buf_to_free == NULL) { cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); return; @@ -96,7 +99,11 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); - kfree(buf_to_free->iface_list); + spin_lock(&buf_to_free->iface_lock); + list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list, + iface_head) + kref_put(&iface->refcount, release_iface); + spin_unlock(&buf_to_free->iface_lock); kfree_sensitive(buf_to_free); } @@ -537,11 +544,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) if (oplock == OPLOCK_EXCLUSIVE) { cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == OPLOCK_READ) { cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else cinode->oplock = 0; } @@ -1211,18 +1218,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void .data = data, .sb = NULL, }; + struct file_system_type **fs_type = (struct file_system_type *[]) { + &cifs_fs_type, &smb3_fs_type, NULL, + }; - iterate_supers_type(&cifs_fs_type, f, &sd); - - if (!sd.sb) - return ERR_PTR(-EINVAL); - /* - * Grab an active reference in order to prevent automounts (DFS links) - * of expiring and then freeing up our cifs superblock pointer while - * we're doing 
failover. - */ - cifs_sb_active(sd.sb); - return sd.sb; + for (; *fs_type; fs_type++) { + iterate_supers_type(*fs_type, f, &sd); + if (sd.sb) { + /* + * Grab an active reference in order to prevent automounts (DFS links) + * of expiring and then freeing up our cifs superblock pointer while + * we're doing failover. + */ + cifs_sb_active(sd.sb); + return sd.sb; + } + } + return ERR_PTR(-EINVAL); } static void __cifs_put_super(struct super_block *sb) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3b7915af1f62..02c8b2906196 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - if (is_server_using_iface(ses->chans[i].server, iface)) { + if (ses->chans[i].iface == iface) { spin_unlock(&ses->chan_lock); return true; } @@ -81,6 +81,9 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, } /* If we didn't find the channel, it is likely a bug */ + if (server) + cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", + server->conn_id); WARN_ON(1); return 0; } @@ -143,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses, return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); } +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return ses->chans[chan_index].iface && + ses->chans[chan_index].iface->is_active; +} + /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { int old_chan_count, new_chan_count; int left; - int i = 0; int rc = 0; int tries = 0; - struct cifs_server_iface *ifaces = NULL; - size_t iface_count; + struct cifs_server_iface *iface = NULL, *niface = NULL; spin_lock(&ses->chan_lock); @@ -182,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) spin_unlock(&ses->chan_lock); /* - * Make a copy of the iface list at the time and use that - * instead so as to not hold the iface spinlock for opening - * channels - */ - spin_lock(&ses->iface_lock); - iface_count = ses->iface_count; - if (iface_count <= 0) { - spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "no iface list available to open channels\n"); - return 0; - } - ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), - GFP_ATOMIC); - if (!ifaces) { - spin_unlock(&ses->iface_lock); - return 0; - } - spin_unlock(&ses->iface_lock); - - /* * Keep connecting to same, fastest, iface for all channels as * long as its RSS. Try next fastest one if not RSS or channel * creation fails. 
*/ + spin_lock(&ses->iface_lock); + iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + spin_unlock(&ses->iface_lock); + while (left > 0) { - struct cifs_server_iface *iface; tries++; if (tries > 3*ses->chan_max) { @@ -216,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) break; } - iface = &ifaces[i]; - if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { - i = (i+1) % iface_count; - continue; + spin_lock(&ses->iface_lock); + if (!ses->iface_count) { + spin_unlock(&ses->iface_lock); + break; } - rc = cifs_ses_add_channel(cifs_sb, ses, iface); - if (rc) { - cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", - i, rc); - i = (i+1) % iface_count; - continue; + list_for_each_entry_safe_from(iface, niface, &ses->iface_list, + iface_head) { + /* skip ifaces that are unusable */ + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + + /* take ref before unlock */ + kref_get(&iface->refcount); + + spin_unlock(&ses->iface_lock); + rc = cifs_ses_add_channel(cifs_sb, ses, iface); + spin_lock(&ses->iface_lock); + + if (rc) { + cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n", + &iface->sockaddr, + rc); + kref_put(&iface->refcount, release_iface); + continue; + } + + cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n", + &iface->sockaddr); + break; } + spin_unlock(&ses->iface_lock); - cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", - i); left--; new_chan_count++; } - kfree(ifaces); return new_chan_count - old_chan_count; } /* + * update the iface for the channel if necessary. + * will return 0 when iface is updated, 1 if removed, 2 otherwise + * Must be called with chan_lock held. + */ +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) +{ + unsigned int chan_index; + struct cifs_server_iface *iface = NULL; + struct cifs_server_iface *old_iface = NULL; + int rc = 0; + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (!chan_index) { + spin_unlock(&ses->chan_lock); + return 0; + } + + if (ses->chans[chan_index].iface) { + old_iface = ses->chans[chan_index].iface; + if (old_iface->is_active) { + spin_unlock(&ses->chan_lock); + return 1; + } + } + spin_unlock(&ses->chan_lock); + + spin_lock(&ses->iface_lock); + /* then look for a new one */ + list_for_each_entry(iface, &ses->iface_list, iface_head) { + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + kref_get(&iface->refcount); + } + + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + rc = 1; + iface = NULL; + cifs_dbg(FYI, "unable to find a suitable iface\n"); + } + + /* now drop the ref to the current iface */ + if (old_iface && iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", + &old_iface->sockaddr, + &iface->sockaddr); + } else if (old_iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "releasing ref to iface: %pIS\n", + &old_iface->sockaddr); + } else { + WARN_ON(!iface); + cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); + } + spin_unlock(&ses->iface_lock); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + ses->chans[chan_index].iface = iface; + + /* No iface is found. 
if secondary chan, drop connection */ + if (!iface && CIFS_SERVER_IS_CHAN(server)) + ses->chans[chan_index].server = NULL; + + spin_unlock(&ses->chan_lock); + + if (!iface && CIFS_SERVER_IS_CHAN(server)) + cifs_put_tcp_session(server, false); + + return rc; +} + +/* * If server is a channel of ses, return the corresponding enclosing * cifs_chan otherwise return NULL. */ @@ -301,7 +393,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* Auth */ ctx.domainauto = ses->domainAuto; ctx.domainname = ses->domainName; - ctx.server_hostname = ses->server->hostname; + + /* no hostname for extra channels */ + ctx.server_hostname = ""; + ctx.username = ses->user_name; ctx.password = ses->password; ctx.sectype = ses->sectype; @@ -349,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, spin_unlock(&ses->chan_lock); goto out; } + chan->iface = iface; ses->chan_count++; atomic_set(&ses->chan_seq, 0); @@ -378,6 +474,14 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, out: if (rc && chan->server) { + /* + * we should avoid race with these delayed works before we + * remove this channel + */ + cancel_delayed_work_sync(&chan->server->echo); + cancel_delayed_work_sync(&chan->server->resolve); + cancel_delayed_work_sync(&chan->server->reconnect); + spin_lock(&ses->chan_lock); /* we rely on all bits beyond chan_count to be clear */ cifs_chan_clear_need_reconnect(ses, chan->server); @@ -388,10 +492,9 @@ out: */ WARN_ON(ses->chan_count < 1); spin_unlock(&ses->chan_lock); - } - if (rc && chan->server) cifs_put_tcp_session(chan->server, 0); + } return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 98a76fa791c0..8802995b2d3d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, size_t buf_len, - struct cifs_server_iface **iface_list, - size_t *iface_count) + struct cifs_ses *ses) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; struct sockaddr_in6 *addr6; struct iface_info_ipv4 *p4; struct iface_info_ipv6 *p6; - struct cifs_server_iface *info; + struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL; + struct cifs_server_iface tmp_iface; ssize_t bytes_left; size_t next = 0; int nb_iface = 0; - int rc = 0; - - *iface_list = NULL; - *iface_count = 0; - - /* - * Fist pass: count and sanity check - */ + int rc = 0, ret = 0; bytes_left = buf_len; p = buf; - while (bytes_left >= sizeof(*p)) { - nb_iface++; - next = le32_to_cpu(p->Next); - if (!next) { - bytes_left -= sizeof(*p); - break; - } - p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); - bytes_left -= next; - } - - if (!nb_iface) { - cifs_dbg(VFS, "%s: malformed interface info\n", __func__); - rc = -EINVAL; - goto out; - } - - /* Azure rounds the buffer size up 8, to a 16 byte boundary */ - if ((bytes_left > 8) || p->Next) - cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - + spin_lock(&ses->iface_lock); /* - * Second pass: extract info to internal structure + * Go through iface_list and do kref_put to remove + * any unused ifaces. 
ifaces in use will be removed + * when the last user calls a kref_put on it */ - - *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); - if (!*iface_list) { - rc = -ENOMEM; - goto out; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + iface->is_active = 0; + kref_put(&iface->refcount, release_iface); } + spin_unlock(&ses->iface_lock); - info = *iface_list; - bytes_left = buf_len; - p = buf; while (bytes_left >= sizeof(*p)) { - info->speed = le64_to_cpu(p->LinkSpeed); - info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; - info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; - - cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); - cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); - cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, - le32_to_cpu(p->Capability)); + memset(&tmp_iface, 0, sizeof(tmp_iface)); + tmp_iface.speed = le64_to_cpu(p->LinkSpeed); + tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; + tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; switch (p->Family) { /* @@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, * conversion explicit in case either one changes. */ case INTERNETWORK: - addr4 = (struct sockaddr_in *)&info->sockaddr; + addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr; p4 = (struct iface_info_ipv4 *)p->Buffer; addr4->sin_family = AF_INET; memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); @@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, &addr4->sin_addr); break; case INTERNETWORKV6: - addr6 = (struct sockaddr_in6 *)&info->sockaddr; + addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr; p6 = (struct iface_info_ipv6 *)p->Buffer; addr6->sin6_family = AF_INET6; memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); @@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, goto next_iface; } - (*iface_count)++; - info++; + /* + * The iface_list is assumed to be sorted by speed. + * Check if the new interface exists in that list. + * NEVER change iface. it could be in use. + * Add a new one instead + */ + spin_lock(&ses->iface_lock); + iface = niface = NULL; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + ret = iface_cmp(iface, &tmp_iface); + if (!ret) { + /* just get a ref so that it doesn't get picked/freed */ + iface->is_active = 1; + kref_get(&iface->refcount); + spin_unlock(&ses->iface_lock); + goto next_iface; + } else if (ret < 0) { + /* all remaining ifaces are slower */ + kref_get(&iface->refcount); + break; + } + } + spin_unlock(&ses->iface_lock); + + /* no match. 
insert the entry in the list */ + info = kmalloc(sizeof(struct cifs_server_iface), + GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto out; + } + memcpy(info, &tmp_iface, sizeof(tmp_iface)); + + /* add this new entry to the list */ + kref_init(&info->refcount); + info->is_active = 1; + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + spin_lock(&ses->iface_lock); + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_add_tail(&info->iface_head, &iface->iface_head); + kref_put(&iface->refcount, release_iface); + } else + list_add_tail(&info->iface_head, &ses->iface_list); + spin_unlock(&ses->iface_lock); + + ses->iface_count++; + ses->iface_last_update = jiffies; next_iface: + nb_iface++; next = le32_to_cpu(p->Next); - if (!next) + if (!next) { + bytes_left -= sizeof(*p); break; + } p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); bytes_left -= next; } - if (!*iface_count) { + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); rc = -EINVAL; goto out; } -out: - if (rc) { - kfree(*iface_list); - *iface_count = 0; - *iface_list = NULL; - } - return rc; -} + /* Azure rounds the buffer size up 8, to a 16 byte boundary */ + if ((bytes_left > 8) || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); -static int compare_iface(const void *ia, const void *ib) -{ - const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; - const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; - return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1); + if (!ses->iface_count) { + rc = -EINVAL; + goto out; + } + +out: + return rc; } -static int +int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; struct network_interface_info_ioctl_rsp *out_buf = NULL; - struct cifs_server_iface *iface_list; - size_t iface_count; struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, - &iface_list, &iface_count); + rc = parse_server_interfaces(out_buf, ret_data_len, ses); if (rc) goto out; - /* sort interfaces from fastest to slowest */ - sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); - - spin_lock(&ses->iface_lock); - kfree(ses->iface_list); - ses->iface_list = iface_list; - ses->iface_count = iface_count; - ses->iface_last_update = jiffies; - spin_unlock(&ses->iface_lock); - out: kfree(out_buf); return rc; @@ -4260,15 +4267,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { cinode->oplock = CIFS_CACHE_RHW_FLG; cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { cinode->oplock = CIFS_CACHE_RW_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == SMB2_OPLOCK_LEVEL_II) { cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else cinode->oplock = 0; } @@ -4307,7 +4314,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, cinode->oplock = 
new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, - &cinode->vfs_inode); + &cinode->netfs.inode); } static void diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0e8c85249579..c705de32e225 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -288,6 +288,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; goto failed; + } else if (rc) { + mutex_unlock(&ses->session_mutex); + goto out; } } else { mutex_unlock(&ses->session_mutex); @@ -540,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { char *pneg_ctxt; + char *hostname = NULL; unsigned int ctxt_len, neg_context_count; if (*total_len > 200) { @@ -567,16 +571,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += ctxt_len; pneg_ctxt += ctxt_len; - ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, - server->hostname); - *total_len += ctxt_len; - pneg_ctxt += ctxt_len; + /* + * secondary channels don't have the hostname field populated + * use the hostname field in the primary channel instead + */ + hostname = CIFS_SERVER_IS_CHAN(server) ? + server->primary_server->hostname : server->hostname; + if (hostname && (hostname[0] != 0)) { + ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, + hostname); + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; + neg_context_count = 3; + } else + neg_context_count = 2; build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); pneg_ctxt += sizeof(struct smb2_posix_neg_context); - - neg_context_count = 4; + neg_context_count++; if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) @@ -5151,6 +5164,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, data = &info; size = sizeof(struct smb2_file_eof_info); + trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof)); + return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, 1, &data, &size); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 2be5e0c8564d..6b88dc2e364f 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); DEFINE_SMB3_RW_DONE_EVENT(zero_done); DEFINE_SMB3_RW_DONE_EVENT(falloc_done); +/* For logging successful set EOF (truncate) */ +DECLARE_EVENT_CLASS(smb3_eof_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset), + TP_ARGS(xid, fid, tid, sesid, offset), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->offset) +) + +#define DEFINE_SMB3_EOF_EVENT(name) \ +DEFINE_EVENT(smb3_eof_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset), \ + TP_ARGS(xid, fid, tid, sesid, offset)) + +DEFINE_SMB3_EOF_EVENT(set_eof); + /* * For handle based calls other than read and write, and get/set info */ diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 
8adf81042498..ccdbec388091 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -22,25 +22,24 @@ static int coda_symlink_filler(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct inode *inode = folio->mapping->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; - char *p = page_address(page); + char *p = folio_address(folio); cii = ITOC(inode); error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); if (error) goto fail; - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_set_error(folio); + folio_unlock(folio); return error; } diff --git a/fs/coredump.c b/fs/coredump.c index ebc43f960b64..9f4aae202109 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -816,9 +816,9 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) { static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_mode & FMODE_LSEEK) { if (dump_interrupted() || - file->f_op->llseek(file, nr, SEEK_CUR) < 0) + vfs_llseek(file, nr, SEEK_CUR) < 0) return 0; cprm->pos += nr; return 1; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7ae59a6afc5c..61ccf7722fc3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -183,6 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct file_ra_state ra; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; @@ -212,6 +213,9 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. 
*/ + file_ra_state_init(&ra, mapping); + page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF); + for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = NULL; @@ -224,19 +228,6 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, pages[i] = page; } - for (i = 0; i < BLKS_PER_BUF; i++) { - struct page *page = pages[i]; - - if (page) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - /* asynchronous error */ - put_page(page); - pages[i] = NULL; - } - } - } - buffer = next_buffer; next_buffer = NEXT_BUFFER(buffer); buffer_blocknr[buffer] = blocknr; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6b4c8094cc7b..f5be777d8279 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -31,7 +31,7 @@ #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ -#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM +#define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index c35711896bd4..fbc71abdabe3 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -53,6 +53,13 @@ struct fscrypt_mode fscrypt_modes[] = { .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, + [FSCRYPT_MODE_AES_256_HCTR2] = { + .friendly_name = "AES-256-HCTR2", + .cipher_str = "hctr2(aes)", + .keysize = 32, + .security_strength = 32, + .ivsize = 32, + }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 5f858cee1e3b..8a054e6d1e68 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,7 +61,7 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } -static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) +static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_CTS) @@ -78,6 +78,14 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) return false; } +static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) +{ + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); +} + static bool supported_direct_key_modes(const struct inode *inode, u32 contents_mode, u32 filenames_mode) { @@ -151,7 +159,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, const struct inode *inode) { - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -187,7 +195,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, { int count = 0; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -1088,10 +1088,10 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) pos += size; length -= size; written += size; - if (did_zero) - *did_zero = true; } while 
(length > 0); + if (did_zero) + *did_zero = true; return written; } diff --git a/fs/dcache.c b/fs/dcache.c index 93f4f5ee07bf..ea5cdec24ea7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2240,6 +2240,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } res = d_splice_alias(inode, found); if (res) { + d_lookup_done(found); dput(found); return res; } @@ -2563,7 +2564,15 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - + /* + * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT + * kernels spin_lock() implicitly disables preemption, but not on + * PREEMPT_RT. So for RT it has to be done explicitly to protect + * the sequence count write side critical section against a reader + * or another writer preempting, which would result in a live lock. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2572,9 +2581,13 @@ static inline unsigned start_dir_add(struct inode *dir) } } -static inline void end_dir_add(struct inode *dir, unsigned n) +static inline void end_dir_add(struct inode *dir, unsigned int n, + wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) @@ -2701,32 +2714,50 @@ mismatch: } EXPORT_SYMBOL(d_alloc_parallel); -void __d_lookup_done(struct dentry *dentry) +/* + * - Unhash the dentry + * - Retrieve and clear the waitqueue head in dentry + * - Return the waitqueue head + */ +static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { - struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent, - dentry->d_name.hash); + wait_queue_head_t *d_wait; + struct hlist_bl_head *b; + + lockdep_assert_held(&dentry->d_lock); + + b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); - wake_up_all(dentry->d_wait); + d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); + return d_wait; +} + +void __d_lookup_unhash_wake(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + wake_up_all(__d_lookup_unhash(dentry)); + spin_unlock(&dentry->d_lock); } -EXPORT_SYMBOL(__d_lookup_done); +EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { + wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); - __d_lookup_done(dentry); + d_wait = __d_lookup_unhash(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); @@ -2738,7 +2769,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) } __d_rehash(dentry); if (dir) - end_dir_add(dir, n); + end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); @@ -2885,6 +2916,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; + wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; @@ -2915,7 +2947,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); - 
__d_lookup_done(target); + d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); @@ -2951,7 +2983,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_end(&dentry->d_seq); if (dir) - end_dir_add(dir, n); + end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); diff --git a/fs/direct-io.c b/fs/direct-io.c index 840752006f60..df5e2d048799 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -117,8 +117,7 @@ struct dio_submit { /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - int op; - int op_flags; + blk_opf_t opf; /* request operation type and flags */ struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ @@ -167,12 +166,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; ssize_t ret; ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, &sdio->from); - if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) { + if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -234,6 +234,7 @@ static inline struct page *dio_get_page(struct dio *dio, */ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; int err; @@ -251,7 +252,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) transferred = dio->result; /* Check for short read case */ - if ((dio->op == REQ_OP_READ) && + if (dio_op == REQ_OP_READ && ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; /* ignore EFAULT if some IO has been done */ @@ -286,7 +287,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) * zeros from unwritten extents. */ if (flags & DIO_COMPLETE_INVALIDATE && - ret > 0 && dio->op == REQ_OP_WRITE && + ret > 0 && dio_op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages) { err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, @@ -305,7 +306,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) */ dio->iocb->ki_pos += transferred; - if (ret > 0 && dio->op == REQ_OP_WRITE) + if (ret > 0 && dio_op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, ret); dio->iocb->ki_complete(dio->iocb, ret); } @@ -329,6 +330,7 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); static void dio_bio_end_aio(struct bio *bio) { struct dio *dio = bio->bi_private; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; unsigned long remaining; unsigned long flags; bool defer_completion = false; @@ -353,7 +355,7 @@ static void dio_bio_end_aio(struct bio *bio) */ if (dio->result) defer_completion = dio->defer_completion || - (dio->op == REQ_OP_WRITE && + (dio_op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages); if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); @@ -396,7 +398,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * bio_alloc() is guaranteed to return a bio when allowed to sleep and * we request a valid number of vectors. 
*/ - bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL); + bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL); bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; @@ -415,6 +417,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; struct bio *bio = sdio->bio; unsigned long flags; @@ -426,7 +429,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) + if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); dio->bio_disk = bio->bi_bdev->bd_disk; @@ -492,7 +495,8 @@ static struct bio *dio_await_one(struct dio *dio) static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { blk_status_t err = bio->bi_status; - bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; + bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty; if (err) { if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) @@ -619,6 +623,7 @@ static int dio_set_defer_completion(struct dio *dio) static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ @@ -653,7 +658,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, * which may decide to handle it or also return an unmapped * buffer head. 
*/ - create = dio->op == REQ_OP_WRITE; + create = dio_op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { i_size = i_size_read(dio->inode); if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) @@ -801,10 +806,11 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; int ret = 0; int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ - if (dio->op == REQ_OP_WRITE) { + if (dio_op == REQ_OP_WRITE) { /* * Read accounting is performed in submit_bio() */ @@ -917,6 +923,7 @@ static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; const unsigned blkbits = sdio->blkbits; const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; @@ -992,7 +999,7 @@ do_holes: loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ - if (dio->op == REQ_OP_WRITE) { + if (dio_op == REQ_OP_WRITE) { put_page(page); return -ENOTBLK; } @@ -1196,12 +1203,11 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { - dio->op = REQ_OP_WRITE; - dio->op_flags = REQ_SYNC | REQ_IDLE; + dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; if (iocb->ki_flags & IOCB_NOWAIT) - dio->op_flags |= REQ_NOWAIT; + dio->opf |= REQ_NOWAIT; } else { - dio->op = REQ_OP_READ; + dio->opf = REQ_OP_READ; } /* @@ -1210,7 +1216,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index ee92634196a8..1105ce3c80cb 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,15 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. +config DLM_DEPRECATED_API + bool "DLM deprecated API" + depends on DLM + help + Enables deprecated DLM timeout features that will be removed in + later Linux kernel releases. + + If you are unsure, say N. 
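For readers unfamiliar with the build pattern this series applies to CONFIG_DLM_DEPRECATED_API, here is a minimal userspace sketch: when the option is off, the deprecated entry points compile down to static inline no-ops, so call sites stay free of #ifdefs. DEMO_DEPRECATED_API and demo_deprecated_init are illustrative names, not dlm symbols.

#include <stdio.h>

#ifdef DEMO_DEPRECATED_API
int demo_deprecated_init(void)
{
	printf("deprecated API initialized\n");
	return 0;
}
#else
/* Option disabled: the call compiles away to a no-op. */
static inline int demo_deprecated_init(void) { return 0; }
#endif

int main(void)
{
	/* The same call site works whether or not the option is enabled. */
	return demo_deprecated_init();
}

Building with and without -DDEMO_DEPRECATED_API exercises both halves; this mirrors how dlm_netlink_init() and friends are stubbed out below when CONFIG_DLM_DEPRECATED_API is unset.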
+ config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 3545fdafc6fb..71dab733cf9a 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -9,7 +9,6 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ - netlink.o \ lowcomms.o \ plock.o \ rcom.o \ @@ -18,5 +17,6 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o +dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index bfac462dd3e8..19ef136f9e4f 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work) if (callbacks[i].flags & DLM_CB_SKIP) { continue; } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); trace_dlm_bast(ls, lkb, callbacks[i].mode); + bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + trace_dlm_ast(ls, lkb); castfn(lkb->lkb_astparam); - trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 42eee2783756..ac8b62106ce0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,8 +75,9 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned int cl_timewarn_cs; - unsigned int cl_waitwarn_us; +#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -102,8 +103,9 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR_TIMEWARN_CS, - CLUSTER_ATTR_WAITWARN_US, +#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR(timewarn_cs, dlm_check_zero); -CLUSTER_ATTR(waitwarn_us, NULL); +#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, +#ifdef CONFIG_DLM_DEPRECATED_API [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, - [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, +#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; +#ifdef CONFIG_DLM_DEPRECATED_API cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; - cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; +#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 +#ifdef CONFIG_DLM_DEPRECATED_API #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ 
-#define DEFAULT_WAITWARN_US 0 +#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" @@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US, +#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index df92b0a07fc6..55c5f2c13ebd 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,8 +37,9 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; +#ifdef CONFIG_DLM_DEPRECATED_API int ci_timewarn_cs; - int ci_waitwarn_us; +#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 776c3ed519f0..8aca8085d24e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -145,7 +145,9 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned long timeout; +#endif }; @@ -203,10 +205,20 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#ifdef CONFIG_DLM_DEPRECATED_API #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +/* least significant 2 bytes are message changed, they are full transmitted + * but at receive side only the 2 bytes LSB will be set. + * + * Even wireshark dlm dissector does only evaluate the lower bytes and note + * that they may not be used on transceiver side, we assume the higher bytes + * are for internal use or reserved so long they are not parsed on receiver + * side. 
+ */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -249,10 +261,12 @@ struct dlm_lkb { struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_ownqueue; /* list of locks for a process */ - struct list_head lkb_time_list; ktime_t lkb_timestamp; - ktime_t lkb_wait_time; + +#ifdef CONFIG_DLM_DEPRECATED_API + struct list_head lkb_time_list; unsigned long lkb_timeout_cs; +#endif struct mutex lkb_cb_mutex; struct work_struct lkb_cb_work; @@ -568,8 +582,10 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; +#ifdef CONFIG_DLM_DEPRECATED_API struct mutex ls_timeout_mutex; struct list_head ls_timeout; +#endif spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; @@ -606,8 +622,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; - struct completion ls_members_done; - int ls_members_result; + struct completion ls_recovery_done; + int ls_recovery_result; struct miscdevice ls_device; @@ -688,7 +704,9 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 +#ifdef CONFIG_DLM_DEPRECATED_API #define LSFL_TIMEWARN 8 +#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +#else +static inline int dlm_netlink_init(void) { return 0; } +static inline void dlm_netlink_exit(void) { }; +static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; +#endif int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 226822f49d30..dac7eb75dba9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); +#ifdef CONFIG_DLM_DEPRECATED_API /* if the operation was a cancel, then return -DLM_ECANCEL, if a timeout caused the cancel then return -ETIMEDOUT */ if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; rv = -ETIMEDOUT; } +#endif if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; @@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&lkb->lkb_time_list); +#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb) kref_get(&lkb->lkb_ref); } +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + /* This is called when we need to remove a reference and are certain it's not the last ref. e.g. del_lkb is always called between a find_lkb/put_lkb and is always the inverse of a previous add_lkb. 
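A short sketch of the unhold_lkb() rework in the next hunk, under simplified assumptions (a plain int stands in for struct kref, and the names are illustrative): instead of checking the return value of kref_put() and asserting it was not the final drop, the release callback itself asserts, since reaching it means the "not the last ref" contract was violated.

#include <assert.h>
#include <stdbool.h>

struct obj {
	int refcount;
};

static void release_must_not_run(struct obj *o)
{
	/* Reached only if the caller dropped the final reference. */
	(void)o;
	assert(false);
}

static void obj_put(struct obj *o, void (*release)(struct obj *))
{
	/* Mimics kref_put(): invoke release on the final reference. */
	if (--o->refcount == 0)
		release(o);
}

int main(void)
{
	struct obj o = { .refcount = 2 };

	/* Safe: another reference is known to be held elsewhere. */
	obj_put(&o, release_must_not_run);
	return 0;
}

The upside of this shape, as in unhold_lkb_assert() above, is that the error check lives in one place rather than at every put site.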
@@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb) static inline void unhold_lkb(struct dlm_lkb *lkb) { - int rv; - rv = kref_put(&lkb->lkb_ref, kill_lkb); - DLM_ASSERT(!rv, dlm_print_lkb(lkb);); + kref_put(&lkb->lkb_ref, unhold_lkb_assert); } static void lkb_add_ordered(struct list_head *new, struct list_head *head, @@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype) return -1; } -static int nodeid_warned(int nodeid, int num_nodes, int *warned) -{ - int i; - - for (i = 0; i < num_nodes; i++) { - if (!warned[i]) { - warned[i] = nodeid; - return 0; - } - if (warned[i] == nodeid) - return 1; - } - return 0; -} - -void dlm_scan_waiters(struct dlm_ls *ls) -{ - struct dlm_lkb *lkb; - s64 us; - s64 debug_maxus = 0; - u32 debug_scanned = 0; - u32 debug_expired = 0; - int num_nodes = 0; - int *warned = NULL; - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (!lkb->lkb_wait_time) - continue; - - debug_scanned++; - - us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); - - if (us < dlm_config.ci_waitwarn_us) - continue; - - lkb->lkb_wait_time = 0; - - debug_expired++; - if (us > debug_maxus) - debug_maxus = us; - - if (!num_nodes) { - num_nodes = ls->ls_num_nodes; - warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); - } - if (!warned) - continue; - if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) - continue; - - log_error(ls, "waitwarn %x %lld %d us check connection to " - "node %d", lkb->lkb_id, (long long)us, - dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); - } - mutex_unlock(&ls->ls_waiters_mutex); - kfree(warned); - - if (debug_expired) - log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", - debug_scanned, debug_expired, - dlm_config.ci_waitwarn_us, (long long)debug_maxus); -} - /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ @@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; - lkb->lkb_wait_time = ktime_get(); lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); @@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +#ifdef CONFIG_DLM_DEPRECATED_API static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; @@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_to_us(lkb->lkb_wait_time)) - lkb->lkb_wait_time = ktime_get(); - } - mutex_unlock(&ls->ls_waiters_mutex); } +#else +static void add_timeout(struct dlm_lkb *lkb) { } +static void del_timeout(struct dlm_lkb *lkb) { } +#endif /* lkb is master or local copy */ @@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error) } } +#ifdef CONFIG_DLM_DEPRECATED_API static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, unsigned long timeout_cs, void (*ast) (void *astparam), void *astparam, void (*bast) (void *astparam, int mode), struct dlm_args *args) +#else +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void 
(*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +#endif { int rv = -EINVAL; @@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; +#ifdef CONFIG_DLM_DEPRECATED_API args->timeout = timeout_cs; +#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; +#ifdef CONFIG_DLM_DEPRECATED_API lkb->lkb_timeout_cs = args->timeout; +#endif rv = 0; out: if (rv) @@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; - trace_dlm_lock_start(ls, lkb, mode, flags); + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); +#else + error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); +#endif if (error) goto out_put; @@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); if (convert || error) __put_lkb(ls, lkb); @@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs) +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs) +#else +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) goto out_put; @@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid) + uint32_t *lkid) { struct dlm_lkb *lkb = NULL, *iter; struct 
dlm_user_args *ua; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 252a5898f908..a7b6474f009d 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -void dlm_scan_waiters(struct dlm_ls *ls); + +#ifdef CONFIG_DLM_DEPRECATED_API void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); +#else +static inline void dlm_scan_timeout(struct dlm_ls *ls) { } +static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } +#endif + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid); + uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 19ed41a5da93..3972f4d86c75 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -275,7 +275,6 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); - dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; @@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) +#ifdef CONFIG_DLM_DEPRECATED_API + if (flags & DLM_LSFL_TIMEWARN) { + pr_warn_once("===============================================================\n" + "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" + " will be removed in v6.2!\n" + " This includes the DLM_LSFL_TIMEWARN define in the UAPI header!\n" + "===============================================================\n"); + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + } /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have some flags set */ + * need to require all nodes to have some flags set + */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#else + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size;
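This lock.h hunk is the template the series applies everywhere for CONFIG_DLM_DEPRECATED_API: with the option set, the legacy declarations keep their timeout_cs parameters; without it, they shrink to timeout-free signatures or static inline no-ops, and call sites carry a matching #ifdef. A compilable sketch of the idiom under hypothetical names (not dlm code):

#include <stdio.h>

struct foo { int id; };

#ifdef CONFIG_FOO_DEPRECATED_API
/* legacy signature: the timeout is still part of the API */
static int foo_start(struct foo *f, unsigned long timeout_cs)
{
	printf("start %d, timeout %lu cs\n", f->id, timeout_cs);
	return 0;
}
static void foo_scan_timeout(struct foo *f) { printf("scan %d\n", f->id); }
#else
/* current signature: timeout support compiled out entirely */
static int foo_start(struct foo *f)
{
	printf("start %d\n", f->id);
	return 0;
}
static inline void foo_scan_timeout(struct foo *f) { (void)f; } /* no-op stub */
#endif

int main(void)
{
	struct foo f = { .id = 1 };

	foo_scan_timeout(&f);       /* compiles either way */
#ifdef CONFIG_FOO_DEPRECATED_API
	return foo_start(&f, 0);    /* call site mirrors fs/dlm/user.c */
#else
	return foo_start(&f);
#endif
}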
@@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); +#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); @@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; - init_completion(&ls->ls_members_done); - ls->ls_members_result = -1; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; mutex_init(&ls->ls_cb_mutex); INIT_LIST_HEAD(&ls->ls_cb_delay); @@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster, if (error) goto out_recoverd; - wait_for_completion(&ls->ls_members_done); - error = ls->ls_members_result; + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; if (error) goto out_members; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 19e82f08c0e0..a4e84e8d94c8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk) return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); return; } @@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, return ret; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); return 0; } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 98084e0cfccf..2af2ccfe43a9 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to - count as a negative change so the "neg" recovery steps will happen */ + * count as a negative change so the "neg" recovery steps will happen + * + * This function must report all member changes to the lsops or + * midcomms layer and must never return before doing so. + */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { log_rinfo(ls, "prev removed member %d", memb->nodeid); @@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - /* error -EINTR means that a new recovery action is triggered. * We ignore this recovery action and let run the new one which might * have new member configuration. */ - if (error == -EINTR) - error = 0; - - /* new_lockspace() may be waiting to know if the config * is good or bad */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; }
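The completion handoff moves here: dlm_recover_members() no longer completes ls_members_done with just the membership result; instead new_lockspace() blocks on ls_recovery_done until a full recovery cycle reports a final verdict (see the recoverd.c hunk further down). A userspace sketch of that producer/consumer shape, with pthreads standing in for the kernel's struct completion (illustrative only):

#include <pthread.h>
#include <stdio.h>

/* a tiny stand-in for the kernel's struct completion */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static struct completion recovery_done = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};
static int recovery_result = -1;

static void complete_one(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* recovery thread: only a final verdict completes the waiter; an
 * interrupted run (-EINTR) would complete nothing and be retried */
static void *recoverd(void *arg)
{
	(void)arg;
	recovery_result = 0;        /* pretend ls_recover() succeeded */
	complete_one(&recovery_done);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, recoverd, NULL);
	wait_for(&recovery_done);   /* the new_lockspace() side */
	printf("recovery result: %d\n", recovery_result);
	pthread_join(t, NULL);
	return 0;
}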
@@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls) if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; - dlm_lsop_recover_prep(ls); + /* call the recover_prep ops only once, not on every possible + * dlm_ls_stop() call while recovery is already stopped. + * + * If we were able to clear the LSFL_RUNNING bit while it was + * set, we know this is the first dlm_ls_stop() call. + */ + if (new) + dlm_lsop_recover_prep(ls); + return 0; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 0993eebf2060..737f185aad8d 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -29,6 +29,8 @@ struct plock_async_data { struct plock_op { struct list_head list; int done; + /* set if the lock op was interrupted while waiting for a dlm_controld reply */ + bool sigint; struct dlm_plock_info info; /* if set indicates async handling */ struct plock_async_data *data; @@ -79,8 +81,7 @@ static void send_op(struct plock_op *op) abandoned waiter. So, we have to insert the unlock-close when the lock call is interrupted. */ -static void do_unlock_close(struct dlm_ls *ls, u64 number, - struct file *file, struct file_lock *fl) +static void do_unlock_close(const struct dlm_plock_info *info) { struct plock_op *op; @@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, return; op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; - op->info.fsid = ls->ls_global_id; - op->info.number = number; + op->info.pid = info->pid; + op->info.fsid = info->fsid; + op->info.number = info->number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = info->owner; op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); @@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); - list_del(&op->list); + /* recheck under ops_lock whether op->done is already nonzero; + * if so, the interrupt can be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + + op->sigint = true; spin_unlock(&ops_lock); - log_print("%s: wait interrupted %x %llx, op removed", + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, - (unsigned long long)number); - dlm_release_plock_op(op); - do_unlock_close(ls, number, file, fl); + (unsigned long long)number, op->info.pid); goto out; } +do_lock_wait: + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { - op = list_entry(send_list.next, struct plock_op, list); + op = list_first_entry(&send_list, struct plock_op, list); if (op->info.flags & DLM_PLOCK_FL_CLOSE) list_del(&op->list); else @@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (iter->info.fsid == info.fsid && iter->info.number == info.number && iter->info.owner == info.owner) { + if (iter->sigint) { + list_del(&iter->list); + spin_unlock(&ops_lock); + + pr_debug("%s: sigint cleanup %x %llx pid %d", + __func__, iter->info.fsid, + (unsigned long long)iter->info.number, + iter->info.pid); + do_unlock_close(&iter->info); + memcpy(&iter->info, &info, sizeof(info)); + dlm_release_plock_op(iter); + return count; + } list_del_init(&iter->list); memcpy(&iter->info, &info, sizeof(info)); if (iter->data) @@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, else wake_up(&recv_wq); } else - log_print("%s: no op %x %llx - may got interrupted?", __func__, + log_print("%s: no op %x %llx", __func__, info.fsid, (unsigned long long)info.number); return count; }
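The plock rework above swaps "delete the op on signal" for "flag it and let the reply path clean up", which closes the race where dlm_controld's reply lands just as the waiter is interrupted. A condensed userspace sketch of the handshake, with a pthread mutex in place of ops_lock and all names illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ops_lock = PTHREAD_MUTEX_INITIALIZER;

struct op {
	int done;
	bool sigint;    /* waiter was interrupted; reply path must clean up */
};

/* waiter side: on interruption, recheck done under the lock first */
static void wait_interrupted(struct op *op)
{
	pthread_mutex_lock(&ops_lock);
	if (op->done) {             /* reply already landed: not a cancel */
		pthread_mutex_unlock(&ops_lock);
		return;
	}
	op->sigint = true;          /* defer cleanup to the reply handler */
	pthread_mutex_unlock(&ops_lock);
}

/* reply side: if the waiter gave up, undo the lock and drop the op */
static void reply_arrived(struct op *op)
{
	pthread_mutex_lock(&ops_lock);
	if (op->sigint) {
		pthread_mutex_unlock(&ops_lock);
		printf("sigint cleanup: sending unlock-close\n");
		return;             /* do_unlock_close() equivalent */
	}
	op->done = 1;
	pthread_mutex_unlock(&ops_lock);
}

int main(void)
{
	struct op op = { 0 };

	wait_interrupted(&op);      /* signal wins the race here */
	reply_arrived(&op);         /* reply path performs the cleanup */
	return 0;
}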
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index a55dfce705dd..e15eb511b04b 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. + * + * Because all membership changes must be reported to the lsops or + * midcomms layer, it is not permitted to abort ls_recover() until + * this is done. */ error = dlm_recover_members(ls, rv, &neg); @@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); - dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_rinfo(ls, "dlm_recover %llu error %d", - (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); + return error; } @@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; + int error; spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; @@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_unlock(&ls->ls_recover_lock); if (rv) { - ls_recover(ls, rv); + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted (-EINTR), wait for the next + * ls_recover() iteration, which will hopefully succeed. + */ + log_rinfo(ls, "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); + + /* let new_lockspace() become aware of the critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + kfree(rv->nodes); kfree(rv); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1060b24f18d4..99e8f0744513 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } +#ifdef CONFIG_DLM_DEPRECATED_API + if (params->timeout) + pr_warn_once("========================================================\n" + "WARNING: the lkb timeout feature is being deprecated and\n" + " will be removed in v6.2!\n" + "========================================================\n"); +#endif + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; @@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); +#else + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); +#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, params->name, params->namelen, - (unsigned long) params->timeout, &lkid); if (!error) error = lkid; } else { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, (unsigned long) params->timeout); +#else + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); +#endif if (!error) error = ua->lksb.sb_lkid; } diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile index 0b1c5e63eb71..7bfc2f9754a8 100644 --- a/fs/efivarfs/Makefile +++ b/fs/efivarfs/Makefile @@ -5,4 +5,4 @@
obj-$(CONFIG_EFIVAR_FS) += efivarfs.o -efivarfs-objs := inode.o file.o super.o +efivarfs-objs := inode.o file.o super.o vars.o diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 30ae44cb7453..8ebf3a6a8aa2 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -7,6 +7,46 @@ #define EFIVAR_FS_INTERNAL_H #include <linux/list.h> +#include <linux/efi.h> + +struct efi_variable { + efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)]; + efi_guid_t VendorGuid; + unsigned long DataSize; + __u8 Data[1024]; + efi_status_t Status; + __u32 Attributes; +} __attribute__((packed)); + +struct efivar_entry { + struct efi_variable var; + struct list_head list; + struct kobject kobj; +}; + +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), + void *data, bool duplicates, struct list_head *head); + +int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); +void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head); +void efivar_entry_remove(struct efivar_entry *entry); +int efivar_entry_delete(struct efivar_entry *entry); + +int efivar_entry_size(struct efivar_entry *entry, unsigned long *size); +int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data); +int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data); +int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, + unsigned long *size, void *data, bool *set); + +int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), + struct list_head *head, void *data); + +bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, + unsigned long data_size); +bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, + size_t len); extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 15880a68faad..6780fc81cc11 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -155,10 +155,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } - efivar_entry_size(entry, &size); - err = efivar_entry_add(entry, &efivarfs_list); - if (err) - goto fail_inode; + __efivar_entry_get(entry, NULL, &size, NULL); + __efivar_entry_add(entry, &efivarfs_list); /* copied by the above to local storage in the dentry. 
*/ kfree(name); @@ -182,10 +180,7 @@ fail: static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - int err = efivar_entry_remove(entry); - - if (err) - return err; + efivar_entry_remove(entry); kfree(entry); return 0; } @@ -221,7 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); if (err) - __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); return err; } @@ -246,7 +241,7 @@ static void efivarfs_kill_sb(struct super_block *sb) kill_litter_super(sb); /* Remove all entries and destroy */ - __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); } static struct file_system_type efivarfs_type = { diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c new file mode 100644 index 000000000000..a0ef63cfcecb --- /dev/null +++ b/fs/efivarfs/vars.c @@ -0,0 +1,738 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Originally from efivars.c + * + * Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com> + * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com> + */ + +#include <linux/capability.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/efi.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/ucs2_string.h> + +#include "internal.h" + +MODULE_IMPORT_NS(EFIVAR); + +static bool +validate_device_path(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + struct efi_generic_dev_path *node; + int offset = 0; + + node = (struct efi_generic_dev_path *)buffer; + + if (len < sizeof(*node)) + return false; + + while (offset <= len - sizeof(*node) && + node->length >= sizeof(*node) && + node->length <= len - offset) { + offset += node->length; + + if ((node->type == EFI_DEV_END_PATH || + node->type == EFI_DEV_END_PATH2) && + node->sub_type == EFI_DEV_END_ENTIRE) + return true; + + node = (struct efi_generic_dev_path *)(buffer + offset); + } + + /* + * If we're here then either node->length pointed past the end + * of the buffer or we reached the end of the buffer without + * finding a device path end node. 
+ */ + return false; +} + +static bool +validate_boot_order(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + /* An array of 16-bit integers */ + if ((len % 2) != 0) + return false; + + return true; +} + +static bool +validate_load_option(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + u16 filepathlength; + int i, desclength = 0, namelen; + + namelen = ucs2_strnlen(var_name, EFI_VAR_NAME_LEN); + + /* Either "Boot" or "Driver" followed by four digits of hex */ + for (i = match; i < match+4; i++) { + if (var_name[i] > 127 || + hex_to_bin(var_name[i] & 0xff) < 0) + return true; + } + + /* Reject it if there's 4 digits of hex and then further content */ + if (namelen > match + 4) + return false; + + /* A valid entry must be at least 8 bytes */ + if (len < 8) + return false; + + filepathlength = buffer[4] | buffer[5] << 8; + + /* + * There's no stored length for the description, so it has to be + * found by hand + */ + desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2; + + /* Each boot entry must have a descriptor */ + if (!desclength) + return false; + + /* + * If the sum of the length of the description, the claimed filepath + * length and the original header are greater than the length of the + * variable, it's malformed + */ + if ((desclength + filepathlength + 6) > len) + return false; + + /* + * And, finally, check the filepath + */ + return validate_device_path(var_name, match, buffer + desclength + 6, + filepathlength); +} + +static bool +validate_uint16(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + /* A single 16-bit integer */ + if (len != 2) + return false; + + return true; +} + +static bool +validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + int i; + + for (i = 0; i < len; i++) { + if (buffer[i] > 127) + return false; + + if (buffer[i] == 0) + return true; + } + + return false; +} + +struct variable_validate { + efi_guid_t vendor; + char *name; + bool (*validate)(efi_char16_t *var_name, int match, u8 *data, + unsigned long len); +}; + +/* + * This is the list of variables we need to validate, as well as the + * whitelist for what we think is safe not to default to immutable. + * + * If it has a validate() method that's not NULL, it'll go into the + * validation routine. If not, it is assumed valid, but still used for + * whitelisting. + * + * Note that it's sorted by {vendor,name}, but globbed names must come after + * any other name with the same prefix. 
+ */ +static const struct variable_validate variable_validate[] = { + { EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 }, + { EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order }, + { EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option }, + { EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order }, + { EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option }, + { EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string }, + { EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL }, + { EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string }, + { EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 }, + { LINUX_EFI_CRASH_GUID, "*", NULL }, + { NULL_GUID, "", NULL }, +}; + +/* + * Check if @var_name matches the pattern given in @match_name. + * + * @var_name: an array of @len non-NUL characters. + * @match_name: a NUL-terminated pattern string, optionally ending in "*". A + * final "*" character matches any trailing characters @var_name, + * including the case when there are none left in @var_name. + * @match: on output, the number of non-wildcard characters in @match_name + * that @var_name matches, regardless of the return value. + * @return: whether @var_name fully matches @match_name. + */ +static bool +variable_matches(const char *var_name, size_t len, const char *match_name, + int *match) +{ + for (*match = 0; ; (*match)++) { + char c = match_name[*match]; + + switch (c) { + case '*': + /* Wildcard in @match_name means we've matched. */ + return true; + + case '\0': + /* @match_name has ended. Has @var_name too? */ + return (*match == len); + + default: + /* + * We've reached a non-wildcard char in @match_name. + * Continue only if there's an identical character in + * @var_name. 
+ */ + if (*match < len && c == var_name[*match]) + continue; + return false; + } + } +} + +bool +efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, + unsigned long data_size) +{ + int i; + unsigned long utf8_size; + u8 *utf8_name; + + utf8_size = ucs2_utf8size(var_name); + utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL); + if (!utf8_name) + return false; + + ucs2_as_utf8(utf8_name, var_name, utf8_size); + utf8_name[utf8_size] = '\0'; + + for (i = 0; variable_validate[i].name[0] != '\0'; i++) { + const char *name = variable_validate[i].name; + int match = 0; + + if (efi_guidcmp(vendor, variable_validate[i].vendor)) + continue; + + if (variable_matches(utf8_name, utf8_size+1, name, &match)) { + if (variable_validate[i].validate == NULL) + break; + kfree(utf8_name); + return variable_validate[i].validate(var_name, match, + data, data_size); + } + } + kfree(utf8_name); + return true; +} + +bool +efivar_variable_is_removable(efi_guid_t vendor, const char *var_name, + size_t len) +{ + int i; + bool found = false; + int match = 0; + + /* + * Check if our variable is in the validated variables list + */ + for (i = 0; variable_validate[i].name[0] != '\0'; i++) { + if (efi_guidcmp(variable_validate[i].vendor, vendor)) + continue; + + if (variable_matches(var_name, len, + variable_validate[i].name, &match)) { + found = true; + break; + } + } + + /* + * If it's in our list, it is removable. + */ + return found; +} + +static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, + struct list_head *head) +{ + struct efivar_entry *entry, *n; + unsigned long strsize1, strsize2; + bool found = false; + + strsize1 = ucs2_strsize(variable_name, 1024); + list_for_each_entry_safe(entry, n, head, list) { + strsize2 = ucs2_strsize(entry->var.VariableName, 1024); + if (strsize1 == strsize2 && + !memcmp(variable_name, &(entry->var.VariableName), + strsize2) && + !efi_guidcmp(entry->var.VendorGuid, + *vendor)) { + found = true; + break; + } + } + return found; +} + +/* + * Returns the size of variable_name, in bytes, including the + * terminating NULL character, or variable_name_size if no NULL + * character is found among the first variable_name_size bytes. + */ +static unsigned long var_name_strnsize(efi_char16_t *variable_name, + unsigned long variable_name_size) +{ + unsigned long len; + efi_char16_t c; + + /* + * The variable name is, by definition, a NULL-terminated + * string, so make absolutely sure that variable_name_size is + * the value we expect it to be. If not, return the real size. + */ + for (len = 2; len <= variable_name_size; len += sizeof(c)) { + c = variable_name[(len / sizeof(c)) - 1]; + if (!c) + break; + } + + return min(len, variable_name_size); +} + +/* + * Print a warning when duplicate EFI variables are encountered and + * disable the sysfs workqueue since the firmware is buggy. + */ +static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, + unsigned long len16) +{ + size_t i, len8 = len16 / sizeof(efi_char16_t); + char *str8; + + str8 = kzalloc(len8, GFP_KERNEL); + if (!str8) + return; + + for (i = 0; i < len8; i++) + str8[i] = str16[i]; + + printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n", + str8, vendor_guid); + kfree(str8); +} + +/** + * efivar_init - build the initial list of EFI variables + * @func: callback function to invoke for every variable + * @data: function-specific data to pass to @func + * @duplicates: error if we encounter duplicates on @head? 
+ * @head: initialised head of variable list + * + * Get every EFI variable from the firmware and invoke @func. @func + * should call efivar_entry_add() to build the list of variables. + * + * Returns 0 on success, or a kernel error code on failure. + */ +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), + void *data, bool duplicates, struct list_head *head) +{ + unsigned long variable_name_size = 1024; + efi_char16_t *variable_name; + efi_status_t status; + efi_guid_t vendor_guid; + int err = 0; + + variable_name = kzalloc(variable_name_size, GFP_KERNEL); + if (!variable_name) { + printk(KERN_ERR "efivars: Memory allocation failed.\n"); + return -ENOMEM; + } + + err = efivar_lock(); + if (err) + goto free; + + /* + * Per EFI spec, the maximum storage allocated for both + * the variable name and variable data is 1024 bytes. + */ + + do { + variable_name_size = 1024; + + status = efivar_get_next_variable(&variable_name_size, + variable_name, + &vendor_guid); + switch (status) { + case EFI_SUCCESS: + variable_name_size = var_name_strnsize(variable_name, + variable_name_size); + + /* + * Some firmware implementations return the + * same variable name on multiple calls to + * get_next_variable(). Terminate the loop + * immediately as there is no guarantee that + * we'll ever see a different variable name, + * and may end up looping here forever. + */ + if (duplicates && + variable_is_present(variable_name, &vendor_guid, + head)) { + dup_variable_bug(variable_name, &vendor_guid, + variable_name_size); + status = EFI_NOT_FOUND; + } else { + err = func(variable_name, vendor_guid, + variable_name_size, data); + if (err) + status = EFI_NOT_FOUND; + } + break; + case EFI_UNSUPPORTED: + err = -EOPNOTSUPP; + status = EFI_NOT_FOUND; + break; + case EFI_NOT_FOUND: + break; + default: + printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n", + status); + status = EFI_NOT_FOUND; + break; + } + + } while (status != EFI_NOT_FOUND); + + efivar_unlock(); +free: + kfree(variable_name); + + return err; +} + +/** + * efivar_entry_add - add entry to variable list + * @entry: entry to add to list + * @head: list head + * + * Returns 0 on success, or a kernel error code on failure. + */ +int efivar_entry_add(struct efivar_entry *entry, struct list_head *head) +{ + int err; + + err = efivar_lock(); + if (err) + return err; + list_add(&entry->list, head); + efivar_unlock(); + + return 0; +} + +/** + * __efivar_entry_add - add entry to variable list + * @entry: entry to add to list + * @head: list head + */ +void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head) +{ + list_add(&entry->list, head); +} + +/** + * efivar_entry_remove - remove entry from variable list + * @entry: entry to remove from list + */ +void efivar_entry_remove(struct efivar_entry *entry) +{ + list_del(&entry->list); +} +
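For orientation, efivar_init() above is an enumerate-with-callback API: it walks GetNextVariableName() under the efivar lock and hands each (name, vendor, name_size) triple to @func, which returns non-zero to stop the walk. A kernel-style sketch of a minimal caller, modeled loosely on efivarfs_callback(); the callback and wrapper names are hypothetical and it assumes only the declarations from internal.h above:

/* hypothetical callback: count variables instead of building dentries */
static int count_one(efi_char16_t *name, efi_guid_t vendor,
		     unsigned long name_size, void *data)
{
	unsigned long *count = data;

	(*count)++;
	return 0;		/* non-zero would terminate the enumeration */
}

static int count_efi_variables(unsigned long *count)
{
	LIST_HEAD(unused);	/* nothing is added, so the list stays empty */

	*count = 0;
	/* duplicates = false: there is no list to check duplicates against */
	return efivar_init(count_one, count, false, &unused);
}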
+/* + * efivar_entry_list_del_unlock - remove entry from variable list + * @entry: entry to remove + * + * Remove @entry from the variable list and release the list lock. + * + * NOTE: slightly weird locking semantics here - we expect to be + * called with the efivars lock already held, and we release it before + * returning. This is because this function is usually called after + * set_variable() while the lock is still held. + */ +static void efivar_entry_list_del_unlock(struct efivar_entry *entry) +{ + list_del(&entry->list); + efivar_unlock(); +} + +/** + * efivar_entry_delete - delete variable and remove entry from list + * @entry: entry containing variable to delete + * + * Delete the variable from the firmware and remove @entry from the + * variable list. It is the caller's responsibility to free @entry + * once we return. + * + * Returns 0 on success, -EINTR if we can't grab the semaphore, + * converted EFI status code if set_variable() fails. + */ +int efivar_entry_delete(struct efivar_entry *entry) +{ + efi_status_t status; + int err; + + err = efivar_lock(); + if (err) + return err; + + status = efivar_set_variable_locked(entry->var.VariableName, + &entry->var.VendorGuid, + 0, 0, NULL, false); + if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) { + efivar_unlock(); + return efi_status_to_err(status); + } + + efivar_entry_list_del_unlock(entry); + return 0; +} + +/** + * efivar_entry_size - obtain the size of a variable + * @entry: entry for this variable + * @size: location to store the variable's size + */ +int efivar_entry_size(struct efivar_entry *entry, unsigned long *size) +{ + efi_status_t status; + int err; + + *size = 0; + + err = efivar_lock(); + if (err) + return err; + + status = efivar_get_variable(entry->var.VariableName, + &entry->var.VendorGuid, NULL, size, NULL); + efivar_unlock(); + + if (status != EFI_BUFFER_TOO_SMALL) + return efi_status_to_err(status); + + return 0; +} + +/** + * __efivar_entry_get - call get_variable() + * @entry: read data for this variable + * @attributes: variable attributes + * @size: size of @data buffer + * @data: buffer to store variable data + * + * The caller MUST hold the efivar lock across the invocation of this + * function. + */ +int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data) +{ + efi_status_t status; + + status = efivar_get_variable(entry->var.VariableName, + &entry->var.VendorGuid, + attributes, size, data); + + return efi_status_to_err(status); +} + +/** + * efivar_entry_get - call get_variable() + * @entry: read data for this variable + * @attributes: variable attributes + * @size: size of @data buffer + * @data: buffer to store variable data + */ +int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data) +{ + int err; + + err = efivar_lock(); + if (err) + return err; + err = __efivar_entry_get(entry, attributes, size, data); + efivar_unlock(); + + return err; +} + +/** + * efivar_entry_set_get_size - call set_variable() and get new size (atomic) + * @entry: entry containing variable to set and get + * @attributes: attributes of variable to be written + * @size: size of data buffer + * @data: buffer containing data to write + * @set: did the set_variable() call succeed? + * + * This is a pretty special (complex) function. See efivarfs_file_write(). + * + * Atomically call set_variable() for @entry and if the call is + * successful, return the new size of the variable from get_variable() + * in @size. The success of set_variable() is indicated by @set. + * + * Returns 0 on success, -EINVAL if the variable data is invalid, + * -ENOSPC if the firmware does not have enough available space, or a + * converted EFI status code if either of set_variable() or + * get_variable() fail.
+ * + * If the EFI variable does not exist when calling set_variable() + * (EFI_NOT_FOUND), @entry is removed from the variable list. + */ +int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, + unsigned long *size, void *data, bool *set) +{ + efi_char16_t *name = entry->var.VariableName; + efi_guid_t *vendor = &entry->var.VendorGuid; + efi_status_t status; + int err; + + *set = false; + + if (efivar_validate(*vendor, name, data, *size) == false) + return -EINVAL; + + /* + * The lock here protects the get_variable call, the conditional + * set_variable call, and removal of the variable from the efivars + * list (in the case of an authenticated delete). + */ + err = efivar_lock(); + if (err) + return err; + + /* + * Ensure that the available space hasn't shrunk below the safe level + */ + status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); + if (status != EFI_SUCCESS) { + if (status != EFI_UNSUPPORTED) { + err = efi_status_to_err(status); + goto out; + } + + if (*size > 65536) { + err = -ENOSPC; + goto out; + } + } + + status = efivar_set_variable_locked(name, vendor, attributes, *size, + data, false); + if (status != EFI_SUCCESS) { + err = efi_status_to_err(status); + goto out; + } + + *set = true; + + /* + * Writing to the variable may have caused a change in size (which + * could either be an append or an overwrite), or the variable to be + * deleted. Perform a GetVariable() so we can tell what actually + * happened. + */ + *size = 0; + status = efivar_get_variable(entry->var.VariableName, + &entry->var.VendorGuid, + NULL, size, NULL); + + if (status == EFI_NOT_FOUND) + efivar_entry_list_del_unlock(entry); + else + efivar_unlock(); + + if (status && status != EFI_BUFFER_TOO_SMALL) + return efi_status_to_err(status); + + return 0; + +out: + efivar_unlock(); + return err; + +} + +/** + * efivar_entry_iter - iterate over variable list + * @func: callback function + * @head: head of variable list + * @data: function-specific data to pass to callback + * + * Iterate over the list of EFI variables and call @func with every + * entry on the list. It is safe for @func to remove entries in the + * list via efivar_entry_delete() while iterating. 
+ * + * Some notes for the callback function: + * - a non-zero return value indicates an error and terminates the loop + * - @func is called with the efivar lock held + */ +int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), + struct list_head *head, void *data) +{ + struct efivar_entry *entry, *n; + int err = 0; + + err = efivar_lock(); + if (err) + return err; + + list_for_each_entry_safe(entry, n, head, list) { + err = func(entry, data); + if (err) + break; + } + efivar_unlock(); + + return err; +} diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fbb037ba326e..fe8ac0e163f7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return iomap_bmap(mapping, block, &erofs_iomap_ops); } -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - loff_t align = iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - if (align & blksize_mask) - return -EINVAL; - return 0; -} - -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX - if (IS_DAX(iocb->ki_filp->f_mapping->host)) + if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if (iocb->ki_flags & IOCB_DIRECT) { - int err = erofs_prepare_dio(iocb, to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; - if (!err) - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, NULL, 0); - if (err < 0) - return err; + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); } return filemap_read(iocb, to, 0); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..2d55569f96ac 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; @@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (page) { __clear_bit(j, bounced); - if (kaddr) { - if (kaddr + PAGE_SIZE == page_address(page)) + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE ==
page_address(page)) { kaddr += PAGE_SIZE; - else - kaddr = NULL; - } else if (!i) { - kaddr = page_address(page); + continue; + } } + kaddr = NULL; continue; } kaddr = NULL; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 05a3063cf2bc..5e59b3f523eb 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -143,6 +143,7 @@ again: DBG_BUGON(z_erofs_lzma_head); z_erofs_lzma_head = head; spin_unlock(&z_erofs_lzma_lock); + wake_up_all(&z_erofs_lzma_wq); z_erofs_lzma_max_dictsize = dict_size; mutex_unlock(&lzma_resize_mutex); diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 18e59821c597..ecf28f66b97d 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, } static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, - void *dentry_blk, unsigned int *ofs, + void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; while (de < end) { @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, /* stopped by some reason */ return 1; ++de; - *ofs += sizeof(struct erofs_dirent); + ctx->pos += sizeof(struct erofs_dirent); } - *ofs = maxsize; return 0; } @@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) nameoff = le16_to_cpu(de->nameoff); if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= PAGE_SIZE) { + nameoff >= EROFS_BLKSIZ) { erofs_err(dir->i_sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; - goto skip_this; + break; } maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, PAGE_SIZE); + dirsize - ctx->pos + ofs, EROFS_BLKSIZ); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = blknr_to_addr(i) + ofs; if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(dir, ctx, de, &ofs, + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); -skip_this: - ctx->pos = blknr_to_addr(i) + ofs; - if (err) break; +skip_this: + ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..5792ca9e0d5e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -46,7 +123,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -150,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pcluster was the tail of an existing chain, and the + * previously processed chained pclusters have all been decided to * be hooked up to it.
- * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or - * pagevec) since it can be directly decoded without I/O submission. + * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -184,39 +260,36 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. 
] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; - + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } - -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, @@ -231,24 +304,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); @@ -269,7 +339,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -283,7 +354,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since it can be moved to the bypass queue instead. */ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -300,7 +371,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. 
*/ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -313,7 +384,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -323,56 +394,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - /* give priority for inplaceio */ - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -385,7 +459,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. 
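z_erofs_try_inplace_io() above claims compressed-page slots lock-free: icur scans backwards and a cmpxchg() on the slot's page pointer either wins it (after which the full bvec is stored) or loses to a concurrent cache fill and keeps scanning. A standalone sketch of that claim loop using C11 atomics (names illustrative, not erofs code):

#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 4

/* each slot's page pointer doubles as the ownership flag */
static _Atomic(void *) slot_page[NR_SLOTS];

/* walk backwards and try to win a free slot with compare-exchange */
static int try_claim_slot(unsigned int *icur, void *page)
{
	while (*icur > 0) {
		void *expected = NULL;

		if (atomic_compare_exchange_strong(&slot_page[--*icur],
						   &expected, page))
			return 1;       /* won this slot exclusively */
		/* lost to a concurrent filler; keep scanning */
	}
	return 0;                       /* no free slot: caller falls back */
}

int main(void)
{
	unsigned int icur = NR_SLOTS;
	int dummy;

	if (try_claim_slot(&icur, &dummy))
		printf("claimed slot %u\n", icur);
	return 0;
}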
*/ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -393,66 +467,21 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled by the original attached chain. */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) -{ - struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; - - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } - mutex_lock(&pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; - - z_erofs_try_to_claim_pcluster(fe); - return 0; -} - -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -471,14 +500,13 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -494,7 +522,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,11 +548,10 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { - struct erofs_workgroup *grp; + struct erofs_map_blocks *map = &fe->map; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -533,38 +560,35 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; + ret = z_erofs_register_pcluster(fe); } - ret = z_erofs_lookup_pcluster(fe, inode, map); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + mutex_lock(&fe->pcl->lock); + /* used to check tail merging loop due to corrupted images */ + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = fe->pcl; + + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { return ret; } - -out: - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -593,14 +617,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
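In the reworked z_erofs_collector_begin() above, -EEXIST is not a failure: it means the lookup (or a racing insert) found a pcluster that is already registered, so the caller locks it and tries to claim it for the current chain instead of bailing out. A compressed sketch of that find-or-register convention over a toy table (single-threaded; the kernel resolves insertion races inside erofs_insert_workgroup()):

#include <errno.h>
#include <stdlib.h>

#define NSLOTS 64

struct pclu { unsigned long key; };
static struct pclu *table[NSLOTS];     /* toy stand-in for the index */

static int pclu_begin(unsigned long key, struct pclu **out)
{
	unsigned long i = key % NSLOTS;

	if (table[i] && table[i]->key == key) {
		*out = table[i];       /* found: caller locks and chains */
		return -EEXIST;
	}
	*out = malloc(sizeof(**out));
	if (!*out)
		return -ENOMEM;
	(*out)->key = key;
	table[i] = *out;               /* registered: caller owns it */
	return 0;                      /* new pcluster, already claimed */
}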
*/ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -628,11 +657,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -653,7 +681,7 @@ repeat: map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -663,9 +691,9 @@ repeat: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -676,11 +704,12 @@ repeat: err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -696,10 +725,10 @@ hitted: * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -707,60 +736,59 @@ hitted: goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? 
Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) - goto err_out; - - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + goto out; + } + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -783,97 +811,137 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; - enum z_erofs_page_type page_type; - bool overlapped, partial; - int err; + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; - might_sleep(); - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; 
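The retry: loop above splits allocation into a two-step handshake: the enqueue path never allocates, it just reports -EAGAIN when the inline bvec slots run out, and the caller supplies one pre-set short-lived page before retrying. A user-space sketch of that handshake (the bvq_* names are made up for illustration):

#include <errno.h>
#include <stdlib.h>

struct bvq {
	int used, total;
	void *spill;                   /* page backing the overflow slots */
};

static int bvq_enqueue(struct bvq *q, void **candidate)
{
	if (q->used == q->total) {
		if (!*candidate)
			return -EAGAIN;        /* retry with a fresh page */
		q->spill = *candidate;         /* consume the candidate */
		*candidate = NULL;
		q->total += 8;                 /* say one page adds 8 slots */
	}
	q->used++;
	return 0;
}

static int attach_one(struct bvq *q)
{
	void *candidate = NULL;
	int err;
retry:
	err = bvq_enqueue(q, &candidate);
	if (err == -EAGAIN && !candidate) {
		candidate = malloc(4096);      /* the short-lived bvpage;
						* the kernel side uses
						* __GFP_NOFAIL instead */
		goto retry;
	}
	free(candidate);                       /* unused candidate, if any */
	return err;
}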
+}; - mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { - pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_bvec_item *item; - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } + if (!oldpage) + return; } - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} - page = z_erofs_pagevec_dequeue(&ctor, &page_type); +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? 
-bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); + } +} - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - DBG_BUGON(pagenr >= nr_pages); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. - */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); } - z_erofs_pagevec_ctor_exit(&ctor, true); - overlapped = false; - compressed_pages = pcl->compressed_pages; + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; + + *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -883,109 +951,129 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; + } + } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = 
-EFSCORRUPTED; - } - pages[pagenr] = page; + if (err) + return err; + return 0; +} - overlapped = true; - } +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); } + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, - .outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial - }, pagepool); + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; if (!page) continue; 
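z_erofs_fill_other_copies() above replays one decompressed extent into each duplicate bvec, splitting the memcpy() at page boundaries: pgnr picks the source page, scur the offset inside it, and len the largest chunk that stays on that page. The same arithmetic in a self-contained user-space form (assumes cur has been clamped so bv_offset + cur >= 0, as the caller above arranges):

#include <string.h>

#define PAGE_SZ 4096u

static void copy_range(char *dst, char **pages, unsigned int off0,
		       int bv_offset, unsigned int cur, unsigned int end)
{
	while (cur < end) {
		unsigned int pgnr = (bv_offset + cur + off0) / PAGE_SZ;
		/* offset of this chunk inside source page pgnr */
		unsigned int scur = bv_offset + cur -
				    (pgnr * PAGE_SZ - off0);
		unsigned int len = end - cur < PAGE_SZ - scur ?
				   end - cur : PAGE_SZ - scur;

		memcpy(dst + cur, pages[pgnr] + scur, len);
		cur += len;
	}
}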
DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ @@ -997,22 +1085,25 @@ out: static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1038,7 +1129,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } @@ -1071,7 +1161,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1087,7 +1177,7 @@ repeat: * otherwise, it will go inplace I/O path instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1113,14 +1203,13 @@ repeat: /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. 
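z_erofs_decompress_kickoff() above rests on a single atomic counter: each submitter adds the number of bios it queued, every completed bio adds -1, and whoever brings the sum to zero fires the completion. A tiny model of that rendezvous with C11 atomics (complete_done() is a stub standing in for complete()):

#include <stdatomic.h>

static atomic_int pending_bios;

static void complete_done(void) { /* wake whoever waits on the queue */ }

/* bios > 0 from the submit path, -1 from each bio's endio */
static void kickoff(int bios)
{
	if (atomic_fetch_add(&pending_bios, bios) + bios == 0)
		complete_done();
}

Because submissions and completions fold into one counter, it does not matter whether the bios finish before or after the submitter accounts for them.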
*/ DBG_BUGON(!justfound); @@ -1149,7 +1238,8 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; @@ -1186,6 +1276,7 @@ fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1246,26 +1337,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1407,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1459,14 @@ submit_bio_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1564,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1613,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..e7f04c4fbb81 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,13 +7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 +#define Z_EROFS_INLINE_BVECS 2 /* * let's leave a type here in case of introducing @@ -21,6 +18,21 @@ */ typedef void 
*z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + /* * Structure fields follow one of the following exclusion rules. * @@ -38,24 +50,21 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ - unsigned short nr_pages; - - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; @@ -72,8 +81,14 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ @@ -94,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -108,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. 
ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -147,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 +#define Z_EROFS_ONSTACK_PAGES 32 #endif diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. 
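With PG_error out of the picture, page->private now carries both pieces of state the header comment describes: bit 31 latches an I/O error while bits 0-30 count the parts still outstanding. A user-space model of split/mark_eio/endio over a plain atomic word (the puts() calls stand in for SetPageUptodate()/unlock_page()):

#include <stdatomic.h>
#include <stdio.h>

#define PAGE_EIO (1u << 31)       /* bit 31: error; bits 0-30: parts */

static void split(atomic_uint *v)
{
	atomic_fetch_add(v, 1);                /* one more pending part */
}

static void mark_eio(atomic_uint *v)
{
	unsigned int old = atomic_load(v);

	/* CAS loop: set the error bit, leave the counter alone */
	while (!atomic_compare_exchange_weak(v, &old, old | PAGE_EIO))
		;
}

static void endio(atomic_uint *v)
{
	unsigned int now = atomic_fetch_sub(v, 1) - 1;

	if (!(now & ~PAGE_EIO)) {              /* last part is done */
		if (!(now & PAGE_EIO))
			puts("SetPageUptodate");
		puts("unlock_page");
	}
}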
- * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? - kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == 
(uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - return tagptr_unfold_ptr(t); -} -#endif diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a1..5fd73915c62c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> +#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -630,7 +631,6 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; - char *kaddr; pos -= bytes_to_copy; arg -= bytes_to_copy; @@ -639,11 +639,8 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; - kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); - memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } @@ -982,10 +979,12 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + bool vfork; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; + vfork = !!tsk->vfork_done; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) @@ -1030,6 +1029,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); + + if (vfork) + timens_on_fork(tsk->nsproxy, tsk); + if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); @@ -1149,7 +1152,7 @@ static int de_thread(struct task_struct *tsk) /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees - * the tracer wont't block again waiting for this thread. + * the tracer won't block again waiting for this thread. 
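The copy_string_kernel() hunk above collapses an open-coded kmap/copy/flush/kunmap sequence into memcpy_to_page(). That helper (from include/linux/highmem.h) is roughly the sequence it replaces, wrapped up once so every call site stays short; approximately:

static inline void memcpy_to_page(struct page *page, size_t offset,
				  const char *from, size_t len)
{
	char *to = kmap_local_page(page);      /* short-lived mapping */

	memcpy(to + offset, from, len);
	flush_dcache_page(page);               /* keep D-cache coherent */
	kunmap_local(to);
}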
*/ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); @@ -1301,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); + exit_itimers(me); flush_itimer_signals(); #endif diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 76acc3721951..c6eaf7e9ea74 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode, return -ENOENT; } - exfat_chain_dup(&olddir, &ei->dir); + exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi), + EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2c2f179b6977..8f597753ac12 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -200,19 +200,19 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n, int quiet, void **page_addr) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { - *page_addr = kmap_local_page(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page) || !ext2_check_page(page, quiet, - *page_addr)) - goto fail; - } + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return &folio->page; + *page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1)); + if (unlikely(!folio_test_checked(folio))) { + if (!ext2_check_page(&folio->page, quiet, *page_addr)) + goto fail; } - return page; + return &folio->page; fail: - ext2_put_page(page, *page_addr); + ext2_put_page(&folio->page, *page_addr); return ERR_PTR(-EIO); } @@ -672,17 +672,14 @@ int ext2_empty_dir (struct inode * inode) void *page_addr = NULL; struct page *page = NULL; unsigned long i, npages = dir_pages(inode); - int dir_has_error = 0; for (i = 0; i < npages; i++) { char *kaddr; ext2_dirent * de; - page = ext2_get_page(inode, i, dir_has_error, &page_addr); + page = ext2_get_page(inode, i, 0, &page_addr); - if (IS_ERR(page)) { - dir_has_error = 1; - continue; - } + if (IS_ERR(page)) + goto not_empty; kaddr = page_addr; de = (ext2_dirent *)kaddr; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index d4f306aa5ace..28de11a22e5f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -795,7 +795,6 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern void ext2_set_file_ops(struct inode *inode); extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_nobh_aops; extern const struct iomap_ops ext2_iomap_ops; /* namei.c */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 360ce3604a2d..918ab2f9e4c0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -908,25 +908,6 @@ static int ext2_write_end(struct file *file, struct address_space *mapping, return ret; } -static int -ext2_nobh_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) -{ - int ret; - - ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, - ext2_get_block); - if (ret < 0) - ext2_write_failed(mapping, pos + len); - return ret; -} - -static int ext2_nobh_writepage(struct page *page, - struct writeback_control *wbc) -{ - return nobh_writepage(page, ext2_get_block, wbc); -} - static sector_t ext2_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ext2_get_block); @@ -973,26 
+954,11 @@ const struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_nobh_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = ext2_read_folio, - .readahead = ext2_readahead, - .writepage = ext2_nobh_writepage, - .write_begin = ext2_nobh_write_begin, - .write_end = nobh_write_end, - .bmap = ext2_bmap, - .direct_IO = ext2_direct_IO, - .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, - .error_remove_page = generic_error_remove_page, -}; - static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, @@ -1298,13 +1264,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) { + if (IS_DAX(inode)) error = dax_zero_range(inode, newsize, PAGE_ALIGN(newsize) - newsize, NULL, &ext2_iomap_ops); - } else if (test_opt(inode->i_sb, NOBH)) - error = nobh_truncate_page(inode->i_mapping, - newsize, ext2_get_block); else error = block_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1396,8 +1359,6 @@ void ext2_set_file_ops(struct inode *inode) inode->i_fop = &ext2_file_operations; if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext2_dax_aops; - else if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; else inode->i_mapping->a_ops = &ext2_aops; } @@ -1497,10 +1458,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { if (ext2_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; @@ -1510,10 +1468,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } else { inode->i_op = &ext2_symlink_inode_operations; inode_nohighmem(inode); - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; } } else { inode->i_op = &ext2_special_inode_operations; @@ -1549,7 +1504,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) if (IS_ERR(raw_inode)) return -EIO; - /* For fields not not tracking in the in-memory inode, + /* For fields not tracking in the in-memory inode, * initialise them to zero for new inodes. 
*/ if (ei->i_state & EXT2_STATE_NEW) memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); @@ -1679,14 +1634,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || - (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - error = dquot_transfer(inode, iattr); + if (i_uid_needs_update(mnt_userns, iattr, inode) || + i_gid_needs_update(mnt_userns, iattr, inode)) { + error = dquot_transfer(mnt_userns, inode, iattr); if (error) return error; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5f6b7560eb3f..5fd9a22d2b70 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -178,10 +178,7 @@ static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; inode_nohighmem(inode); - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; @@ -247,10 +244,7 @@ static int ext2_mkdir(struct user_namespace * mnt_userns, inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; inode_inc_link_count(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..27a0a8c74f7a 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -296,9 +296,6 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noacl"); #endif - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); @@ -551,7 +548,8 @@ static int parse_options(char *options, struct super_block *sb, clear_opt (opts->s_mount_opt, OLDALLOC); break; case Opt_nobh: - set_opt (opts->s_mount_opt, NOBH); + ext2_msg(sb, KERN_INFO, + "nobh option not supported"); break; #ifdef CONFIG_EXT2_FS_XATTR case Opt_user_xattr: @@ -1059,9 +1057,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group); goto failed_mount; } - if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", + "error: invalid #inodes per group: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -1071,6 +1070,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group != + le32_to_cpu(es->s_inodes_count)) { + ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu", + le32_to_cpu(es->s_inodes_count), + (u64)sbi->s_groups_count * sbi->s_inodes_per_group); + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array(db_count, @@ -1490,8 +1496,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread 
= len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; @@ -1529,8 +1534,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head *bh; while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 75b8d81b2469..29fc575a4eb6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3058,14 +3058,14 @@ extern unsigned int ext4_list_backups(struct super_block *sb, /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, - sector_t block, int op_flags); + sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); -extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, +extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io); -extern int ext4_read_bh(struct buffer_head *bh, int op_flags, +extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io); -extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait); +extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 795a60ad1897..eb4c8ad1bb61 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -658,7 +658,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { - int write_flags = REQ_SYNC; + blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ @@ -668,7 +668,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3dce7d058985..9fd60fc8ba4c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -829,7 +829,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); } /* Maximum number of blocks we map for direct IO at once. 
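The int to blk_opf_t churn in the prototypes above, like submit_bh(REQ_OP_WRITE | write_flags, bh) in the fast-commit hunk, reflects the block layer folding the request operation and its modifier flags into one typed word. A sketch of the encoding idea (the bit positions here are illustrative, not the kernel's exact values):

#include <stdint.h>

typedef uint32_t blk_opf_t;       /* op in the low bits, flags above */

#define REQ_OP_BITS  8
#define REQ_OP_MASK  ((1u << REQ_OP_BITS) - 1)
#define REQ_OP_READ  0u
#define REQ_OP_WRITE 1u
#define REQ_SYNC     (1u << 11)   /* illustrative flag bits */
#define REQ_META     (1u << 12)

static unsigned int op_of(blk_opf_t opf)    { return opf & REQ_OP_MASK; }
static blk_opf_t    flags_of(blk_opf_t opf) { return opf & ~REQ_OP_MASK; }

/* a call site then passes a single expression:
 *	submit(REQ_OP_WRITE | REQ_SYNC | REQ_META);
 */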
*/ @@ -1554,9 +1554,9 @@ struct mpage_da_data { static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { - int nr_pages, i; + unsigned nr, i; pgoff_t index, end; - struct pagevec pvec; + struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; @@ -1574,15 +1574,18 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_es_remove_extent(inode, start, last - start + 1); } - pagevec_init(&pvec); + folio_batch_init(&fbatch); while (index <= end) { - nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); - if (nr_pages == 0) + nr = filemap_get_folios(mapping, &index, end, &fbatch); + if (nr == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct folio *folio = page_folio(page); + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + if (folio->index < mpd->first_page) + continue; + if (folio->index + folio_nr_pages(folio) - 1 > end) + continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { @@ -1594,7 +1597,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); } } @@ -2311,8 +2314,8 @@ out: */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { - struct pagevec pvec; - int nr_pages, i; + struct folio_batch fbatch; + unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; @@ -2326,14 +2329,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec); + folio_batch_init(&fbatch); while (start <= end) { - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &start, end); - if (nr_pages == 0) + nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); + if (nr == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr; i++) { + struct page *page = &fbatch.folios[i]->page; err = mpage_process_page(mpd, page, &lblk, &pblock, &map_bh); @@ -2349,14 +2351,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0) goto out; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. 
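Both mpage hunks above converge on the same batched walk: fill a folio_batch, process it, release it, and repeat until the index passes end. The added folio->index range checks matter because a large folio can straddle the requested window. The loop shape, reduced to its skeleton (per-folio work elided):

static void walk_range(struct address_space *mapping,
		       pgoff_t index, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int nr, i;

	folio_batch_init(&fbatch);
	while (index <= end) {
		nr = filemap_get_folios(mapping, &index, end, &fbatch);
		if (nr == 0)
			break;
		for (i = 0; i < nr; i++) {
			struct folio *folio = fbatch.folios[i];

			/* ... per-folio work, with bound checks ... */
		}
		folio_batch_release(&fbatch);   /* drop the batch's refs */
	}
}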
*/ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: - pagevec_release(&pvec); + folio_batch_release(&fbatch); return err; } @@ -3631,7 +3633,7 @@ static const struct address_space_operations ext4_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3666,7 +3668,7 @@ static const struct address_space_operations ext4_da_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -5350,14 +5352,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5374,7 +5376,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5383,10 +5385,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9f12f29bc346..9e06334771a3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4104,6 +4104,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = size >> bsbits; start = start_off >> bsbits; + /* + * For tiny groups (smaller than 8MB) the chosen allocation + * alignment may be larger than group size. Make sure the + * alignment does not move allocation to a different group which + * makes mballoc fail assertions later. + */ + start = max(start, rounddown(ac->ac_o_ex.fe_logical, + (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); + /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; @@ -4176,7 +4185,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } rcu_read_unlock(); - if (start + size <= ac->ac_o_ex.fe_logical && + /* + * In this function "start" and "size" are normalized for better + * alignment and length such that we could preallocate more blocks. 
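The new clamp above is pure arithmetic: round the goal's logical block down to its group boundary and never let the normalized start fall below that, so a large alignment on a tiny group cannot push the window into a neighbouring group. Worked numbers (all values invented for illustration):

#include <stdio.h>

#define ROUNDDOWN(x, y) ((x) - ((x) % (y)))

int main(void)
{
	unsigned long bpg = 2048;      /* blocks per (tiny) group */
	unsigned long goal = 5000;     /* ac_o_ex.fe_logical */
	unsigned long start = 0;       /* an 8192-block alignment pulled
					* the window down to zero */

	if (start < ROUNDDOWN(goal, bpg))
		start = ROUNDDOWN(goal, bpg);

	/* prints 4096: the same 2048-block group as the goal block */
	printf("start=%lu\n", start);
	return 0;
}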
+ * This normalization is done such that original request of + * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and + * "size" boundaries. + * (Note fe_len can be relaxed since FS block allocation API does not + * provide guarantee on number of contiguous blocks allocation since that + * depends upon free space left, etc). + * In case of inode pa, later we use the allocated blocks + * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated + * range of goal/best blocks [start, size] to put it at the + * ac_o_ex.fe_logical extent of this inode. + * (See ext4_mb_use_inode_pa() for more details) + */ + if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 7a5353a8cfd7..42f590518b4c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -438,7 +438,7 @@ int ext4_ext_migrate(struct inode *inode) /* * Worst case we can touch the allocation bitmaps and a block - * group descriptor block. We do need need to worry about + * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 79d05e464c43..9af68a7ecdcf 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -150,8 +150,6 @@ static int kmmpd(void *data) mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, EXT4_MMP_MIN_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); - bdevname(bh->b_bdev, mmp->mmp_bdevname); memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); @@ -372,13 +370,16 @@ skip: EXT4_SB(sb)->s_mmp_bh = bh; + BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); + snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname), + "%pg", bh->b_bdev); + /* * Start a kernel thread to update the MMP block periodically. */ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", (int)sizeof(mmp->mmp_bdevname), - bdevname(bh->b_bdev, - mmp->mmp_bdevname)); + mmp->mmp_bdevname); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { EXT4_SB(sb)->s_mmp_tsk = NULL; ext4_warning(sb, "Unable to create kmmpd thread for %s.", diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 47d0ca4c795b..db4ba99d1ceb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1929,7 +1929,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; + unsigned continued; + int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 14695e2b5042..97fa7b4c645f 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -465,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* * In the first loop we prepare and mark buffers to submit.
We have to * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * end_page_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 90a941d20dff..8b70a4701293 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; /* + * If the reserved GDT blocks is non-zero, the resize_inode feature + * should always be set. + */ + if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && + !ext4_has_feature_resize_inode(sb)) { + ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); + return -EFSCORRUPTED; + } + + /* * If we are not using the primary superblock/GDT copy don't resize, * because the user tools have no way of handling this. Probably a * bad time to do it anyways. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 450c918d68fc..2c68dec63e54 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb); +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); @@ -159,7 +159,7 @@ MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) -static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, +static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { /* @@ -171,10 +171,10 @@ static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); - submit_bh(REQ_OP_READ, op_flags, bh); + submit_bh(REQ_OP_READ | op_flags, bh); } -void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, +void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); @@ -186,7 +186,7 @@ void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, __ext4_read_bh(bh, op_flags, end_io); } -int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); @@ -203,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) return -EIO; } -int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) +int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { if (trylock_buffer(bh)) { if (wait) @@ -227,8 +227,8 @@ int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) * return. 
*/ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, - sector_t block, int op_flags, - gfp_t gfp) + sector_t block, + blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; @@ -248,7 +248,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, - int op_flags) + blk_opf_t op_flags) { return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); } @@ -1870,31 +1870,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es) } #endif -static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) -{ -#ifdef CONFIG_FS_ENCRYPTION - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - - err = fscrypt_set_test_dummy_encryption(sb, arg, - &sbi->s_dummy_enc_policy); - if (err) { - ext4_msg(sb, KERN_WARNING, - "Error while setting test dummy encryption [%d]", err); - return err; - } - ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); -#endif - return 0; -} - #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) -#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) @@ -1911,7 +1892,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; - char *test_dummy_enc_arg; + struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; @@ -1953,7 +1934,7 @@ static void ext4_fc_free(struct fs_context *fc) for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); - kfree(ctx->test_dummy_enc_arg); + fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } @@ -2029,6 +2010,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) } #endif +static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, + struct ext4_fs_context *ctx) +{ + int err; + + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + ext4_msg(NULL, KERN_WARNING, + "test_dummy_encryption option not supported"); + return -EINVAL; + } + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->dummy_enc_policy); + if (err == -EINVAL) { + ext4_msg(NULL, KERN_WARNING, + "Value of option \"%s\" is unrecognized", param->key); + } else if (err == -EEXIST) { + ext4_msg(NULL, KERN_WARNING, + "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + return err; +} + #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ @@ -2291,29 +2295,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (param->type == fs_value_is_flag) { - ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; - ctx->test_dummy_enc_arg = NULL; - return 0; - } - if (*param->string && - !(!strcmp(param->string, "v1") || - !strcmp(param->string, "v2"))) { - ext4_msg(NULL, KERN_WARNING, - "Value of option \"%s\" is unrecognized", - param->key); - return -EINVAL; - } - ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; - ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, - GFP_KERNEL); - return 0; -#else - ext4_msg(NULL, KERN_WARNING, - 
"test_dummy_encryption option not supported"); - return -EINVAL; -#endif + return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX @@ -2504,7 +2486,8 @@ parse_failed: if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; - ret = ext4_apply_options(fc, sb); + ext4_apply_options(fc, sb); + ret = 0; out_free: if (fc) { @@ -2673,11 +2656,11 @@ err_jquota_specified: static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { -#ifdef CONFIG_FS_ENCRYPTION const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; - if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)) + if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { @@ -2691,14 +2674,46 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && - !sbi->s_dummy_enc_policy.policy) { + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, + &ctx->dummy_enc_policy)) + return 0; ext4_msg(NULL, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); + "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } -#endif /* CONFIG_FS_ENCRYPTION */ - return 0; + /* Also make sure s_mount_opts didn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, + &ctx->dummy_enc_policy)) + return 0; + ext4_msg(NULL, KERN_WARNING, + "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + /* + * fscrypt_add_test_dummy_key() technically changes the super_block, so + * technically it should be delayed until ext4_apply_options() like the + * other changes. But since we never get here for remounts (see above), + * and this is the last chance to report errors, we do it here. 
+ */ + err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); + if (err) + ext4_msg(NULL, KERN_WARNING, + "Error adding test dummy encryption key [%d]", err); + return err; +} + +static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, + struct super_block *sb) +{ + if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) + return; + EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; + memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, @@ -2785,11 +2800,10 @@ fail_dax_change_remount: return ext4_check_quota_consistency(fc, sb); } -static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) +static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; - int ret = 0; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; @@ -2825,11 +2839,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb) #endif ext4_apply_quota_options(fc, sb); - - if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) - ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg); - - return ret; + ext4_apply_test_dummy_encryption(ctx, sb); } @@ -4552,9 +4562,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err < 0) goto failed_mount; - err = ext4_apply_options(fc, sb); - if (err < 0) - goto failed_mount; + ext4_apply_options(fc, sb); #if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { @@ -5302,14 +5310,6 @@ no_journal: err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } - /* - * Update the checksum after updating free space/inode - * counters. Otherwise the superblock can have an incorrect - * checksum in the buffer cache until it is written out and - * e2fsprogs programs trying to open a file system immediately - * after it is mounted can fail. - */ - ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); @@ -5367,6 +5367,14 @@ no_journal: EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + /* + * Update the checksum after updating free space/inode counters and + * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); @@ -5898,7 +5906,6 @@ static void ext4_update_super(struct super_block *sb) static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; - int error = 0; if (!sbh) return -EINVAL; @@ -5907,6 +5914,13 @@ static int ext4_commit_super(struct super_block *sb) ext4_update_super(sb); + lock_buffer(sbh); + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(sbh)) { + unlock_buffer(sbh); + return -EIO; + } + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. 
A previous attempt to write the @@ -5921,17 +5935,21 @@ static int ext4_commit_super(struct super_block *sb) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - error = __sync_dirty_buffer(sbh, - REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); + get_bh(sbh); + /* Clear potential dirty bit if it was journalled update */ + clear_buffer_dirty(sbh); + sbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE | REQ_SYNC | + (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); + wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); + return -EIO; } - return error; + return 0; } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 042325349098..564e28a1aa94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1895,11 +1895,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6d8b2bf14de0..8259e0fa97e1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -463,9 +463,7 @@ const struct address_space_operations f2fs_meta_aops = { .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .migrate_folio = filemap_migrate_folio, }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 24824cd96f36..009e6c519e98 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1832,45 +1832,40 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { struct address_space *mapping = sbi->compress_inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); if (!mapping->nrpages) return; - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { - unsigned int nr_pages; - int i; + unsigned int nr, i; - nr_pages = pagevec_lookup_range(&pvec, mapping, - &index, end - 1); - if (!nr_pages) + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); + if (!nr) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page->index > end) - break; + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - if (ino != get_page_private_data(page)) { - unlock_page(page); + if (ino != get_page_private_data(&folio->page)) { + folio_unlock(folio); continue; } - generic_error_remove_page(mapping, page); - unlock_page(page); + generic_error_remove_page(mapping, &folio->page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } while (index < end); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7fcbcf979737..ed503d50f95f 100644 --- a/fs/f2fs/data.c +++ 
b/fs/f2fs/data.c @@ -387,11 +387,11 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) +static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; unsigned int fua_flag, meta_flag, io_flag; - unsigned int op_flags = 0; + blk_opf_t op_flags = 0; if (fio->op != REQ_OP_WRITE) return 0; @@ -999,7 +999,7 @@ out: } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages, unsigned op_flag, + unsigned nr_pages, blk_opf_t op_flag, pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1047,7 +1047,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, - block_t blkaddr, int op_flags, bool for_write) + block_t blkaddr, blk_opf_t op_flags, + bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1181,7 +1182,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write) + blk_opf_t op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -3751,42 +3752,6 @@ out: return blknr; } -#ifdef CONFIG_MIGRATION -#include <linux/migrate.h> - -int f2fs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - int rc, extra_count = 0; - - BUG_ON(PageWriteback(page)); - - rc = migrate_page_move_mapping(mapping, newpage, - page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - /* guarantee to start from no stale private field */ - set_page_private(newpage, 0); - if (PagePrivate(page)) { - set_page_private(newpage, page_private(page)); - SetPagePrivate(newpage); - get_page(newpage); - - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - - return MIGRATEPAGE_SUCCESS; -} -#endif - #ifdef CONFIG_SWAP static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int blkcnt) @@ -4018,15 +3983,13 @@ const struct address_space_operations f2fs_dblock_aops = { .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .dirty_folio = f2fs_dirty_data_folio, + .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif }; void f2fs_clear_page_cache_dirty_tag(struct page *page) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9bbecd008d2..85921a1f2db8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1183,8 +1183,8 @@ struct f2fs_io_info { nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ - int op; /* contains REQ_OP_ */ - int op_flags; /* req_flag_bits */ + enum req_op op; /* contains REQ_OP_ */ + blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ @@ -3741,7 +3741,7 
@@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write); + blk_opf_t op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); @@ -3764,10 +3764,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool f2fs_release_folio(struct folio *folio, gfp_t wait); -#ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode); -#endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); int f2fs_init_post_read_processing(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd14cef1b08f..d66e37d80a2d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if ((attr->ia_valid & ATTR_UID && - !uid_eq(attr->ia_uid, inode->i_uid)) || - (attr->ia_valid & ATTR_GID && - !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(inode, attr); + err = dquot_transfer(mnt_userns, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. 
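The i_uid_update()/i_gid_update() pattern in these setattr hunks generalizes across the converted filesystems. A minimal sketch, assuming the idmapped-mount helpers introduced by this series (apply_owner_change is hypothetical):

#include <linux/fs.h>

/* Hypothetical helper showing the shape of the converted setattr paths:
 * the idmapping-aware helpers subsume the open-coded ATTR_UID/ATTR_GID
 * comparisons and assignments. */
static void apply_owner_change(struct user_namespace *mnt_userns,
			       struct iattr *attr, struct inode *inode)
{
	if (i_uid_needs_update(mnt_userns, attr, inode) ||
	    i_gid_needs_update(mnt_userns, attr, inode)) {
		/* ...quota transfer / journalling would go here... */
		i_uid_update(mnt_userns, attr, inode);
		i_gid_update(mnt_userns, attr, inode);
	}
}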
*/ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index be599f31d3c4..d84c5f6cc09d 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) unsigned int cnt; struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) { unsigned long long iostat_diff[NR_IO_TYPE]; int i; + unsigned long flags; if (time_is_after_jiffies(sbi->iostat_next_period)) return; /* Need double check under the lock */ - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irq(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irq(&sbi->iostat_lock); - spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irq(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irq(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes) { + unsigned long flags; + if (!sbi->iostat_enable) return; - spin_lock_bh(&sbi->iostat_lock); + spin_lock_irqsave(&sbi->iostat_lock, flags); sbi->rw_iostat[type] += io_bytes; if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) @@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; - spin_unlock_bh(&sbi->iostat_lock); + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); } @@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; + unsigned long flags; if (!sbi->iostat_enable) return; @@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, idx = WRITE_ASYNC_IO; } - 
spin_lock_bh(&sbi->iostat_lat_lock); + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); io_lat->sum_lat[idx][iotype] += ts_diff; io_lat->bio_cnt[idx][iotype]++; if (ts_diff > io_lat->peak_lat[idx][iotype]) io_lat->peak_lat[idx][iotype] = ts_diff; - spin_unlock_bh(&sbi->iostat_lat_lock); + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); } void iostat_update_and_unbind_ctx(struct bio *bio, int rw) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c549acb52ac4..bf00d5057abb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -89,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); @@ -107,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_init_extent_tree(inode, NULL); - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -127,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, set_compress_context(inode); } + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -325,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, if (!is_extension_exist(name, ext[i], false)) continue; + /* Do not use inline_data with compression */ + stat_dec_inline_inode(inode); + clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 836c79a20afc..740c72d088f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1327,7 +1327,7 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int op_flags) +static int read_node_page(struct page *page, blk_opf_t op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; @@ -1450,7 +1450,9 @@ page_hit: out_err: ClearPageUptodate(page); out_put_err: - f2fs_handle_page_eio(sbi, page->index, NODE); + /* ENOENT comes from read_node_page which is not an error. 
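The iostat hunks above convert BH-only locking to IRQ-safe locking because these counters are now also updated from hard-IRQ completion context. A minimal sketch of the pattern (stat_add is a hypothetical helper):

#include <linux/spinlock.h>

/* Hypothetical helper showing the conversion: _bh protection is not
 * enough once the counter is touched from hard-IRQ context, so
 * irqsave/irqrestore is used instead. */
static void stat_add(spinlock_t *lock, unsigned long long *counter,
		     unsigned long long delta)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	*counter += delta;
	spin_unlock_irqrestore(lock, flags);
}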
*/ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); f2fs_put_page(page, 1); return ERR_PTR(err); } @@ -2163,9 +2165,7 @@ const struct address_space_operations f2fs_node_aops = { .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .migrate_folio = filemap_migrate_folio, }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3cb7f8a43b4d..dcd0a1e35095 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page) memset(&attr, 0, sizeof(attr)); - attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); - attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!uid_eq(attr.ia_uid, inode->i_uid)) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) attr.ia_valid |= ATTR_UID; - if (!gid_eq(attr.ia_gid, inode->i_gid)) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(inode, &attr); + err = dquot_transfer(&init_user_ns, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 874c1b9c41a2..c7afc588cf26 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1082,7 +1082,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - int flag = dpolicy->sync ? REQ_SYNC : 0; + blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3dae3ed60f3a..3e4eb3467cb4 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. 
*/ - err = security_inode_setattr(file->f_path.dentry, &ia); + err = security_inode_setattr(file_mnt_user_ns(file), + file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || + (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || + (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; diff --git a/fs/fcntl.c b/fs/fcntl.c index 34a3faa4886d..146c9ab0cd4b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: diff --git a/fs/file_table.c b/fs/file_table.c index 5424e3a8df5f..99c6796c9f28 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp; static void file_free_rcu(struct rcu_head *head) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + struct file *f = container_of(head, struct file, f_rcuhead); put_cred(f->f_cred); kmem_cache_free(filp_cachep, f); @@ -56,7 +56,7 @@ static inline void file_free(struct file *f) security_file_free(f); if (!(f->f_mode & FMODE_NOACCOUNT)) percpu_counter_dec(&nr_files); - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + call_rcu(&f->f_rcuhead, file_free_rcu); } /* @@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free_rcu(&f->f_u.fu_rcuhead); + file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } @@ -235,12 +235,15 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); + if (fop->llseek) + file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; + file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) @@ -341,13 +344,13 @@ static void delayed_fput(struct work_struct *unused) struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; - llist_for_each_entry_safe(f, t, node, f_u.fu_llist) + llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_u.fu_rcuhead)); + __fput(container_of(work, struct file, f_rcuhead)); } /* @@ -374,8 +377,8 @@ void fput(struct file *file) struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_u.fu_rcuhead, ____fput); - if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) + init_task_work(&file->f_rcuhead, ____fput); + if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) return; /* * After this task has run 
exit_task_work(), @@ -384,7 +387,7 @@ void fput(struct file *file) */ } - if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) + if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index c2ef9f0debbd..9b49ec36e667 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -13,16 +13,6 @@ #include "vxfs_extern.h" #include "vxfs_inode.h" - -static int vxfs_immed_read_folio(struct file *, struct folio *); - -/* - * Address space operations for immed files and directories. - */ -const struct address_space_operations vxfs_immed_aops = { - .read_folio = vxfs_immed_read_folio, -}; - /** * vxfs_immed_read_folio - read part of an immed inode into pagecache * @file: file context (unused) @@ -30,7 +20,7 @@ const struct address_space_operations vxfs_immed_aops = { * * Description: * vxfs_immed_read_folio reads a part of the immed area of the - * file that hosts @pp into the pagecache. + * file that hosts @folio into the pagecache. * * Returns: * Zero on success, else a negative error code. @@ -38,21 +28,26 @@ const struct address_space_operations vxfs_immed_aops = { * Locking status: * @folio is locked and will be unlocked. */ -static int -vxfs_immed_read_folio(struct file *fp, struct folio *folio) +static int vxfs_immed_read_folio(struct file *fp, struct folio *folio) { - struct page *pp = &folio->page; - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT; - caddr_t kaddr; + struct vxfs_inode_info *vip = VXFS_INO(folio->mapping->host); + void *src = vip->vii_immed.vi_immed + folio_pos(folio); + unsigned long i; - kaddr = kmap(pp); - memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE); - kunmap(pp); - - flush_dcache_page(pp); - SetPageUptodate(pp); - unlock_page(pp); + for (i = 0; i < folio_nr_pages(folio); i++) { + memcpy_to_page(folio_page(folio, i), 0, src, PAGE_SIZE); + src += PAGE_SIZE; + } + + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } + +/* + * Address space operations for immed files and directories. 
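The vxfs conversion below shows the common shape of a folio-based read_folio for data that already sits in memory. A hedged sketch of that shape (inmem_read_folio and its src argument are hypothetical):

#include <linux/highmem.h>
#include <linux/pagemap.h>

/* Hypothetical helper with the same shape as the vxfs conversion: copy
 * PAGE_SIZE chunks from an in-memory source into every page of the
 * folio, then mark the whole folio uptodate and unlock it. */
static int inmem_read_folio(void *src, struct folio *folio)
{
	unsigned long i;

	for (i = 0; i < folio_nr_pages(folio); i++) {
		memcpy_to_page(folio_page(folio, i), 0, src, PAGE_SIZE);
		src += PAGE_SIZE;
	}
	folio_mark_uptodate(folio);
	folio_unlock(folio);
	return 0;
}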
+ */ +const struct address_space_operations vxfs_immed_aops = { + .read_folio = vxfs_immed_read_folio, +}; diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 0e633d2bfc7d..c99282df7761 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -51,15 +51,9 @@ vxfs_get_page(struct address_space *mapping, u_long n) kmap(pp); /** if (!PageChecked(pp)) **/ /** vxfs_check_page(pp); **/ - if (PageError(pp)) - goto fail; } return (pp); - -fail: - vxfs_put_page(pp); - return ERR_PTR(-EIO); } /** diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a21d8f1a56d1..05221366a16d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode, struct list_head *head) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); list_move(&inode->i_io_list, head); @@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue, inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; + spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - spin_lock(&inode->i_lock); inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) @@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue, goto out; } - /* Move inodes from one superblock together */ + /* + * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', + * we don't take inode->i_lock here because it is just a pointless overhead. + * Inode is already marked as I_SYNC_QUEUED so writeback list handling is + * fully under our control. + */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { @@ -1826,8 +1832,8 @@ static long writeback_sb_inodes(struct super_block *sb, * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ - spin_unlock(&inode->i_lock); requeue_io(inode, wb); + spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } @@ -2358,6 +2364,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; + struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); @@ -2410,13 +2417,24 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* + * Grab inode's wb early because it requires dropping i_lock and we + * need to make sure following checks happen atomically with dirty + * list handling so that we don't move inodes under flush worker's + * hands. + */ + if (!was_dirty) { + wb = locked_inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); + } + + /* * If the inode is queued for writeback by flush worker, just * update its dirty state. Once the flush worker is done with * the inode it will place it on the appropriate superblock * list, based upon its state. */ if (inode->i_state & I_SYNC_QUEUED) - goto out_unlock_inode; + goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's @@ -2424,22 +2442,19 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out_unlock_inode; + goto out_unlock; } if (inode->i_state & I_FREEING) - goto out_unlock_inode; + goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). 
*/ if (!was_dirty) { - struct bdi_writeback *wb; struct list_head *dirty_list; bool wakeup_bdi = false; - wb = locked_inode_to_wb_and_lock_list(inode); - inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; @@ -2453,6 +2468,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) dirty_list); spin_unlock(&wb->list_lock); + spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* @@ -2467,6 +2483,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) return; } } +out_unlock: + if (wb) + spin_unlock(&wb->list_lock); out_unlock_inode: spin_unlock(&inode->i_lock); } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9d3cf0111709..74920826d8f6 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -372,17 +372,22 @@ nomem: return NULL; } +static inline bool fscache_cookie_is_dropped(struct fscache_cookie *cookie) +{ + return READ_ONCE(cookie->state) == FSCACHE_COOKIE_STATE_DROPPED; +} + static void fscache_wait_on_collision(struct fscache_cookie *candidate, struct fscache_cookie *wait_for) { enum fscache_cookie_state *statep = &wait_for->state; - wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED, + wait_var_event_timeout(statep, fscache_cookie_is_dropped(wait_for), 20 * HZ); - if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) { + if (!fscache_cookie_is_dropped(wait_for)) { pr_notice("Potential collision c=%08x old: c=%08x", candidate->debug_id, wait_for->debug_id); - wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED); + wait_var_event(statep, fscache_cookie_is_dropped(wait_for)); } } @@ -517,7 +522,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) } fscache_see_cookie(cookie, fscache_cookie_see_active); - fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_lock(&cookie->lock); + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_INVALIDATING); + else + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); trace = fscache_access_lookup_cookie_end; out: @@ -752,6 +764,9 @@ again_locked: spin_lock(&cookie->lock); } + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end); + switch (state) { case FSCACHE_COOKIE_STATE_RELINQUISHING: fscache_see_cookie(cookie, fscache_cookie_see_relinquish); @@ -1048,6 +1063,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie, return; case FSCACHE_COOKIE_STATE_LOOKING_UP: + __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie); + set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags); + fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); _leave(" [look %x]", cookie->inval_counter); diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index f2aa7dbad766..a058e0136bfe 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -143,7 +143,7 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, { wait_var_event_timeout(&candidate->flags, !fscache_is_acquire_pending(candidate), 20 * HZ); - if (!fscache_is_acquire_pending(candidate)) { + if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); @@ -182,7 +182,7 @@ static bool fscache_hash_volume(struct 
fscache_volume *candidate) hlist_bl_add_head(&candidate->hash_link, h); hlist_bl_unlock(h); - if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags)) + if (fscache_is_acquire_pending(candidate)) fscache_wait_on_volume_collision(candidate, collidee_debug_id); return true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 05caa2b9272e..00fa861aeead 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1042,7 +1042,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) { unsigned int flags = iocb->ki_filp->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) flags |= O_DSYNC; if (iocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 106e90a36583..57ff883d432c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -774,7 +774,7 @@ static const struct address_space_operations gfs2_aops = { .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index b6697333bb2b..3bdb2c668a71 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -310,9 +310,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, - REQ_RAHEAD | REQ_META | REQ_PRIO, - rabh); + submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, rabh); continue; } unlock_buffer(rabh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 42b7dfffb5e7..a0562dd1bada 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1508,9 +1508,8 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, - REQ_RAHEAD | REQ_META | REQ_PRIO, - bh); + submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, bh); continue; } brelse(bh); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f0ee3ff6f9a8..eec4159b08aa 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -823,7 +823,7 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp) void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, - int op_flags) + blk_opf_t op_flags) { struct gfs2_log_header *lh; u32 hash, crc; @@ -905,7 +905,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { - int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; + blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index fc905c2af53c..653cffcbf869 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -82,7 +82,7 @@ extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, unsigned int *extra_revokes); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, - int op_flags); + blk_opf_t op_flags); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6ba51cbb94cf..1902413d5d12 100644 --- a/fs/gfs2/lops.c +++ 
b/fs/gfs2/lops.c @@ -238,7 +238,7 @@ static void gfs2_end_log_write(struct bio *bio) * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf) { struct bio *bio = *biop; if (bio) { @@ -292,7 +292,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, */ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, - struct bio **biop, int op, + struct bio **biop, enum req_op op, bio_end_io_t *end_io, bool flush) { struct bio *bio = *biop; @@ -452,36 +452,36 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * @head: The journal head to start from * @done: If set, perform only cleanup, else search and set if found. * - * Find the page with 'index' in the journal's mapping. Search the page for + * Find the folio with 'index' in the journal's mapping. Search the folio for * the journal head if requested (cleanup == false). Release refs on the - * page so the page cache can reclaim it (put_page() twice). We grabbed a - * reference on this page two times, first when we did a find_or_create_page() - * to obtain the page to add it to the bio and second when we do a - * find_get_page() here to get the page to wait on while I/O on it is being + * folio so the page cache can reclaim it. We grabbed a + * reference on this folio twice, first when we did a find_or_create_page() + * to obtain the folio to add it to the bio and second when we do a + * filemap_get_folio() here to get the folio to wait on while I/O on it is being * completed. - * This function is also used to free up a page we might've grabbed but not + * This function is also used to free up a folio we might've grabbed but not * used. Maybe we added it to a bio, but not submitted it for I/O. Or we * submitted the I/O, but we already found the jhead so we only need to drop - * our references to the page. + * our references to the folio. 
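The double put described in the comment below collapses into a single folio_put_refs() call. A minimal sketch, assuming the folio is still present in the page cache as the journal-head scan guarantees (wait_and_release is hypothetical):

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical helper; assumes the folio is still in the page cache.
 * Both references (one from the earlier find_or_create_page(), one from
 * filemap_get_folio()) are dropped in a single call. */
static void wait_and_release(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio = filemap_get_folio(mapping, index);

	folio_wait_locked(folio);
	folio_put_refs(folio, 2);
}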
*/ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, struct gfs2_log_header_host *head, bool *done) { - struct page *page; + struct folio *folio; - page = find_get_page(jd->jd_inode->i_mapping, index); - wait_on_page_locked(page); + folio = filemap_get_folio(jd->jd_inode->i_mapping, index); - if (PageError(page)) + folio_wait_locked(folio); + if (folio_test_error(folio)) *done = true; if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, page); + *done = gfs2_jhead_pg_srch(jd, head, &folio->page); - put_page(page); /* Once for find_get_page */ - put_page(page); /* Once more for find_or_create_page */ + /* filemap_get_folio() and the earlier find_or_create_page() */ + folio_put_refs(folio, 2); } static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index f707601597dc..1412ffba1d44 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -16,7 +16,7 @@ extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, struct page *page, unsigned size, unsigned offset, u64 blkno); -extern void gfs2_log_submit_bio(struct bio **biop, int opf); +extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 868dcc71b581..7e70e0ba5a6c 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -34,7 +34,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); + blk_opf_t write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -75,7 +75,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); nr_underway++; } bh = next; @@ -217,14 +217,13 @@ static void gfs2_meta_read_endio(struct bio *bio) * Submit several consecutive buffer head I/O requests as a single bio I/O * request. (See submit_bh_wbc.) 
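gfs2_submit_bhs() batches consecutive buffer heads into one bio under the combined blk_opf_t argument. A simplified sketch of that batching loop (end-io wiring omitted; submit_bhs is a hypothetical reduction of the function shown in the hunk below):

#include <linux/bio.h>
#include <linux/buffer_head.h>

/* Hypothetical reduction: pack as many consecutive buffer heads as fit
 * into one bio, submit it, then start over with a fresh bio. */
static void submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
{
	while (num > 0) {
		struct buffer_head *bh = *bhs;
		struct bio *bio;

		bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO);
		bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
		while (num > 0) {
			bh = *bhs;
			if (!bio_add_page(bio, bh->b_page, bh->b_size,
					  bh_offset(bh)))
				break;
			bhs++;
			num--;
		}
		submit_bio(bio);
	}
}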
*/ -static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], - int num) +static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) { while (num > 0) { struct buffer_head *bh = *bhs; struct bio *bio; - bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO); + bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); while (num > 0) { bh = *bhs; @@ -288,7 +287,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -527,7 +526,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); + ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -536,9 +535,8 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, - REQ_RAHEAD | REQ_META | REQ_PRIO, - 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 59d727a4ae2c..c98a7faa67d3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -746,7 +746,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c0a73a6ffb28..c83fd0e8404d 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -296,10 +296,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page = read_mapping_page(mapping, block++, NULL); if (IS_ERR(page)) goto fail; - if (PageError(page)) { - put_page(page); - goto fail; - } node->page[i] = page; } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 177fae4e6581..a5ab00e54220 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -447,10 +447,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page = read_mapping_page(mapping, block, NULL); if (IS_ERR(page)) goto fail; - if (PageError(page)) { - put_page(page); - goto fail; - } node->page[i] = page; } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 396e73aa0961..a5db2e3b2980 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -525,7 +525,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len, /* wrapper.c */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, - void **data, int op, int op_flags); + void **data, blk_opf_t opf); int hfsplus_read_wrapper(struct super_block *sb); /* diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 63164ebc52fa..9ec21664eda6 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -112,8 +112,7 @@ static int hfs_parse_new_pmap(struct super_block *sb, void *buf, if ((u8 *)pm - (u8 *)buf >= buf_size) { res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK + i, - buf, (void **)&pm, REQ_OP_READ, - 0); + buf, (void **)&pm, REQ_OP_READ); if (res) return 
res; } @@ -137,7 +136,7 @@ int hfs_part_find(struct super_block *sb, return -ENOMEM; res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, - buf, &data, REQ_OP_READ, 0); + buf, &data, REQ_OP_READ); if (res) goto out; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 8479add998b5..122ed89ebf9f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, + sbi->s_vhdr_buf, NULL, REQ_OP_WRITE | REQ_SYNC); if (!error) error = error2; @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, + sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE | REQ_SYNC); if (!error) error2 = error; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0b8ad6586df5..0b791adf02e5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -45,8 +45,9 @@ struct hfsplus_wd { * will work correctly. */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, - void *buf, void **data, int op, int op_flags) + void *buf, void **data, blk_opf_t opf) { + const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; int ret = 0; u64 io_size; @@ -63,10 +64,10 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, offset = start & (io_size - 1); sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); - bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO); + bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO); bio->bi_iter.bi_sector = sector; - if (op != WRITE && data) + if (op != REQ_OP_WRITE && data) *data = (u8 *)buf + offset; while (io_size > 0) { @@ -184,7 +185,7 @@ int hfsplus_read_wrapper(struct super_block *sb) reread: error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, - REQ_OP_READ, 0); + REQ_OP_READ); if (error) goto out_free_backup_vhdr; @@ -216,8 +217,7 @@ reread: error = hfsplus_submit_bio(sb, part_start + part_size - 2, sbi->s_backup_vhdr_buf, - (void **)&sbi->s_backup_vhdr, REQ_OP_READ, - 0); + (void **)&sbi->s_backup_vhdr, REQ_OP_READ); if (error) goto out_free_backup_vhdr; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cc1bc6f93a01..07881b76d42f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -416,15 +416,15 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); if (err != count) { - ClearPageUptodate(page); + if (err >= 0) + err = -EIO; + mapping_set_error(mapping, err); goto out; } if (base > inode->i_size) inode->i_size = base; - if (PageError(page)) - ClearPageError(page); err = 0; out: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 62408047e8d7..20336cb3c040 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,16 +108,6 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) } #endif -static void huge_pagevec_release(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); ++i) - put_page(pvec->pages[i]); - - pagevec_reinit(pvec); -} - /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. 
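The hostfs hunk above replaces clearing the uptodate bit on a failed writepage with recording the error on the mapping, so that a later fsync() observes it. A minimal sketch of that reporting pattern (report_write_error is hypothetical):

#include <linux/errno.h>
#include <linux/pagemap.h>

/* Hypothetical helper: normalize a short write to an errno and latch it
 * on the mapping so a later fsync()/msync() reports it. */
static void report_write_error(struct address_space *mapping, int err)
{
	if (err >= 0)
		err = -EIO;
	mapping_set_error(mapping, err);
}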
@@ -480,25 +470,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); const pgoff_t end = lend >> huge_page_shift(h); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - pagevec_init(&pvec); + folio_batch_init(&fbatch); next = start; - while (next < end) { - /* - * When no more pages are found, we are done. - */ - if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) - break; - - for (i = 0; i < pagevec_count(&pvec); ++i) { - struct page *page = pvec.pages[i]; + while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); ++i) { + struct folio *folio = fbatch.folios[i]; u32 hash = 0; - index = page->index; + index = folio->index; if (!truncate_op) { /* * Only need to hold the fault mutex in the @@ -511,15 +495,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } /* - * If page is mapped, it was faulted in after being + * If folio is mapped, it was faulted in after being * unmapped in caller. Unmap (again) now after taking * the fault mutex. The mutex will prevent faults - * until we finish removing the page. + * until we finish removing the folio. * * This race can only happen in the hole punch case. * Getting here in a truncate operation is a bug. */ - if (unlikely(page_mapped(page))) { + if (unlikely(folio_mapped(folio))) { BUG_ON(truncate_op); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -532,7 +516,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_unlock_write(mapping); } - lock_page(page); + folio_lock(folio); /* * We must free the huge page and remove from page * cache (remove_huge_page) BEFORE removing the @@ -542,8 +526,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * the subpool and global reserve usage count can need * to be adjusted. 
*/ - VM_BUG_ON(HPageRestoreReserve(page)); - remove_huge_page(page); + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + remove_huge_page(&folio->page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, @@ -551,11 +535,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, hugetlb_fix_reserve_counts(inode); } - unlock_page(page); + folio_unlock(folio); if (!truncate_op) mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - huge_pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } @@ -600,41 +584,79 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) remove_inode_hugepages(inode, offset, LLONG_MAX); } +static void hugetlbfs_zero_partial_page(struct hstate *h, + struct address_space *mapping, + loff_t start, + loff_t end) +{ + pgoff_t idx = start >> huge_page_shift(h); + struct folio *folio; + + folio = filemap_lock_folio(mapping, idx); + if (!folio) + return; + + start = start & ~huge_page_mask(h); + end = end & ~huge_page_mask(h); + if (!end) + end = huge_page_size(h); + + folio_zero_segment(folio, (size_t)start, (size_t)end); + + folio_unlock(folio); + folio_put(folio); +} + static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* - * For hole punch round up the beginning offset of the hole and - * round down the end. + * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); - if (hole_end > hole_start) { - struct address_space *mapping = inode->i_mapping; - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode_lock(inode); - inode_lock(inode); + /* protected by i_rwsem */ + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + inode_unlock(inode); + return -EPERM; + } - /* protected by i_rwsem */ - if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - inode_unlock(inode); - return -EPERM; - } + i_mmap_lock_write(mapping); + + /* If range starts before first full page, zero partial page. */ + if (offset < hole_start) + hugetlbfs_zero_partial_page(h, mapping, + offset, min(offset + len, hole_start)); - i_mmap_lock_write(mapping); + /* Unmap users of full pages in the hole. */ + if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); - i_mmap_unlock_write(mapping); - remove_inode_hugepages(inode, hole_start, hole_end); - inode_unlock(inode); } + /* If range extends beyond last full page, zero partial page. */ + if ((offset + len) > hole_end && (offset + len) > hole_start) + hugetlbfs_zero_partial_page(h, mapping, + hole_end, offset + len); + + i_mmap_unlock_write(mapping); + + /* Remove full pages from the file. 
*/ + if (hole_end > hole_start) + remove_inode_hugepages(inode, hole_start, hole_end); + + inode_unlock(inode); + return 0; } @@ -759,7 +781,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by add_to_page_cache() + * unlock_page because locked by huge_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -970,28 +992,33 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns, return error; } -static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, +#ifdef CONFIG_MIGRATION +static int hugetlbfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; - rc = migrate_huge_page_move_mapping(mapping, newpage, page); + rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(page)) { - hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); - hugetlb_set_page_subpool(page, NULL); + if (hugetlb_page_subpool(&src->page)) { + hugetlb_set_page_subpool(&dst->page, + hugetlb_page_subpool(&src->page)); + hugetlb_set_page_subpool(&src->page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(dst, src); else - migrate_page_states(newpage, page); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } +#else +#define hugetlbfs_migrate_folio NULL +#endif static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) @@ -1158,7 +1185,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, - .migratepage = hugetlbfs_migrate_page, + .migrate_folio = hugetlbfs_migrate_folio, .error_remove_page = hugetlbfs_error_remove_page, }; diff --git a/fs/inode.c b/fs/inode.c index 9d9b422504d1..524ee91f74a6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -27,7 +27,7 @@ * Inode locking rules: * * inode->i_lock protects: - * inode->i_state, inode->i_hash, __iget() + * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: @@ -604,7 +604,7 @@ void clear_inode(struct inode *inode) { /* * We have to cycle the i_pages lock here because reclaim can be in the - * process of removing the last page (in __delete_from_page_cache()) + * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. */ xa_lock_irq(&inode->i_data.i_pages); @@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns, return notify_change(mnt_userns, dentry, &newattrs, NULL); } -/* - * Remove special file priviledges (suid, capabilities) when file is written - * to or truncated. - */ -int file_remove_privs(struct file *file) +static int __file_remove_privs(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); + int error; int kill; - int error = 0; - /* - * Fast path for nothing security related. - * As well for non-regular files, e.g. blkdev inodes. - * For example, blkdev_write_iter() might get here - * trying to remove privs which it is not allowed to. 
- */ if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); - if (kill < 0) + if (kill <= 0) return kill; - if (kill) - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_privs); /** - * file_update_time - update mtime and ctime time - * @file: file accessed + * file_remove_privs - remove special file privileges (suid, capabilities) + * @file: file to remove privileges from + * + * When file is modified by a write or truncation ensure that special + * file privileges are removed. * - * Update the mtime and ctime members of an inode and mark the inode - * for writeback. Note that this function is meant exclusively for - * usage in the file write path of filesystems, and filesystems may - * choose to explicitly ignore update via this function with the - * S_NOCMTIME inode flag, e.g. for network filesystem where these - * timestamps are handled by the server. This can return an error for - * file systems who need to allocate space in order to update an inode. + * Return: 0 on success, negative errno on failure. */ +int file_remove_privs(struct file *file) +{ + return __file_remove_privs(file, 0); +} +EXPORT_SYMBOL(file_remove_privs); -int file_update_time(struct file *file) +static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) { - struct inode *inode = file_inode(file); - struct timespec64 now; int sync_it = 0; - int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2079,37 +2069,127 @@ int file_update_time(struct file *file) if (!sync_it) return 0; - /* Finally allowed to write? Takes lock. */ - if (__mnt_want_write_file(file)) - return 0; + return sync_it; +} + +static int __file_update_time(struct file *file, struct timespec64 *now, + int sync_mode) +{ + int ret = 0; + struct inode *inode = file_inode(file); - ret = inode_update_time(inode, &now, sync_it); - __mnt_drop_write_file(file); + /* try to update time settings */ + if (!__mnt_want_write_file(file)) { + ret = inode_update_time(inode, now, sync_mode); + __mnt_drop_write_file(file); + } return ret; } + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode for + * writeback. Note that this function is meant exclusively for usage in + * the file write path of filesystems, and filesystems may choose to + * explicitly ignore updates via this function with the S_NOCMTIME inode + * flag, e.g. for network filesystem where these timestamps are handled + * by the server. This can return an error for file systems who need to + * allocate space in order to update an inode. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int file_update_time(struct file *file) +{ + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); + + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + + return __file_update_time(file, &now, ret); +} EXPORT_SYMBOL(file_update_time); -/* Caller must hold the file's inode lock */ -int file_modified(struct file *file) +/** + * file_modified_flags - handle mandated vfs changes when modifying a file + * @file: file that was modified + * @flags: kiocb flags + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * If IOCB_NOWAIT is set, special file privileges will not be removed and + * time settings will not be updated. It will return -EAGAIN. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +static int file_modified_flags(struct file *file, int flags) { - int err; + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - err = file_remove_privs(file); - if (err) - return err; + ret = __file_remove_privs(file, flags); + if (ret) + return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - return file_update_time(file); + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + return __file_update_time(file, &now, ret); +} + +/** + * file_modified - handle mandated vfs changes when modifying a file + * @file: file that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int file_modified(struct file *file) +{ + return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); +/** + * kiocb_modified - handle mandated vfs changes when modifying a file + * @iocb: iocb that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int kiocb_modified(struct kiocb *iocb) +{ + return file_modified_flags(iocb->ki_filp, iocb->ki_flags); +} +EXPORT_SYMBOL_GPL(kiocb_modified); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/fs/io-wq.c b/fs/io-wq.c deleted file mode 100644 index 824623bcf1a5..000000000000 --- a/fs/io-wq.c +++ /dev/null @@ -1,1424 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Basic worker thread pool for io_uring - * - * Copyright (C) 2019 Jens Axboe - * - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/sched/signal.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/rculist_nulls.h> -#include <linux/cpu.h> -#include <linux/task_work.h> -#include <linux/audit.h> -#include <uapi/linux/io_uring.h> - -#include "io-wq.h" - -#define WORKER_IDLE_TIMEOUT (5 * HZ) - -enum { - IO_WORKER_F_UP = 1, /* up and active */ - IO_WORKER_F_RUNNING = 2, /* account as running */ - IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ -}; - -enum { - IO_WQ_BIT_EXIT = 0, /* wq exiting */ -}; - -enum { - IO_ACCT_STALLED_BIT = 0, /* stalled on hash */ -}; - -/* - * One for each thread in a wqe pool - */ -struct io_worker { - refcount_t ref; - unsigned flags; - struct hlist_nulls_node nulls_node; - struct list_head all_list; - struct task_struct *task; - struct io_wqe *wqe; - - struct io_wq_work *cur_work; - struct io_wq_work *next_work; - raw_spinlock_t lock; - - struct completion ref_done; - - unsigned long create_state; - struct callback_head create_work; - int create_index; - - union { - struct rcu_head rcu; - struct work_struct work; - }; -}; - -#if BITS_PER_LONG == 64 -#define IO_WQ_HASH_ORDER 6 -#else -#define IO_WQ_HASH_ORDER 5 -#endif - -#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) - -struct io_wqe_acct { - unsigned nr_workers; - unsigned max_workers; - int index; - atomic_t nr_running; - raw_spinlock_t lock; - struct io_wq_work_list work_list; - unsigned long flags; -}; - -enum { - IO_WQ_ACCT_BOUND, - IO_WQ_ACCT_UNBOUND, - IO_WQ_ACCT_NR, -}; - -/* - * Per-node worker thread pool - */ -struct io_wqe { - raw_spinlock_t lock; - struct io_wqe_acct acct[IO_WQ_ACCT_NR]; - - int node; - - struct hlist_nulls_head free_list; - struct list_head all_list; - - struct wait_queue_entry wait; - - struct io_wq *wq; - struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; - - cpumask_var_t cpu_mask; -}; - -/* - * Per io_wq state - */ -struct io_wq { - unsigned long state; - - free_work_fn *free_work; - io_wq_work_fn *do_work; - - struct io_wq_hash *hash; - - atomic_t worker_refs; - struct completion worker_done; - - struct hlist_node cpuhp_node; - - struct task_struct *task; - - struct io_wqe *wqes[]; -}; - -static enum cpuhp_state io_wq_online; - -struct io_cb_cancel_data { - work_cancel_fn *fn; - void *data; - int nr_running; - int nr_pending; - bool cancel_all; -}; - -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); -static void io_wqe_dec_running(struct io_worker *worker); -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, - struct io_cb_cancel_data *match); -static void create_worker_cb(struct callback_head *cb); -static void io_wq_cancel_tw_create(struct io_wq *wq); - -static bool io_worker_get(struct io_worker *worker) -{ - return refcount_inc_not_zero(&worker->ref); -} - -static void io_worker_release(struct io_worker *worker) -{ - if (refcount_dec_and_test(&worker->ref)) - 
complete(&worker->ref_done); -} - -static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) -{ - return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; -} - -static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, - struct io_wq_work *work) -{ - return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); -} - -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) -{ - return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); -} - -static void io_worker_ref_put(struct io_wq *wq) -{ - if (atomic_dec_and_test(&wq->worker_refs)) - complete(&wq->worker_done); -} - -static void io_worker_cancel_cb(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - - atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); -} - -static bool io_task_worker_match(struct callback_head *cb, void *data) -{ - struct io_worker *worker; - - if (cb->func != create_worker_cb) - return false; - worker = container_of(cb, struct io_worker, create_work); - return worker == data; -} - -static void io_worker_exit(struct io_worker *worker) -{ - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - - while (1) { - struct callback_head *cb = task_work_cancel_match(wq->task, - io_task_worker_match, worker); - - if (!cb) - break; - io_worker_cancel_cb(worker); - } - - io_worker_release(worker); - wait_for_completion(&worker->ref_done); - - raw_spin_lock(&wqe->lock); - if (worker->flags & IO_WORKER_F_FREE) - hlist_nulls_del_rcu(&worker->nulls_node); - list_del_rcu(&worker->all_list); - raw_spin_unlock(&wqe->lock); - io_wqe_dec_running(worker); - worker->flags = 0; - preempt_disable(); - current->flags &= ~PF_IO_WORKER; - preempt_enable(); - - kfree_rcu(worker, rcu); - io_worker_ref_put(wqe->wq); - do_exit(0); -} - -static inline bool io_acct_run_queue(struct io_wqe_acct *acct) -{ - bool ret = false; - - raw_spin_lock(&acct->lock); - if (!wq_list_empty(&acct->work_list) && - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - ret = true; - raw_spin_unlock(&acct->lock); - - return ret; -} - -/* - * Check head of free list for an available worker. If one isn't available, - * caller must create one. - */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe, - struct io_wqe_acct *acct) - __must_hold(RCU) -{ - struct hlist_nulls_node *n; - struct io_worker *worker; - - /* - * Iterate free_list and see if we can find an idle worker to - * activate. If a given worker is on the free_list but in the process - * of exiting, keep trying. - */ - hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { - if (!io_worker_get(worker)) - continue; - if (io_wqe_get_acct(worker) != acct) { - io_worker_release(worker); - continue; - } - if (wake_up_process(worker->task)) { - io_worker_release(worker); - return true; - } - io_worker_release(worker); - } - - return false; -} - -/* - * We need a worker. If we find a free one, we're good. If not, and we're - * below the max number of workers, create one. - */ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) -{ - /* - * Most likely an attempt to queue unbounded work on an io_wq that - * wasn't setup with any unbounded workers. 
- */ - if (unlikely(!acct->max_workers)) - pr_warn_once("io-wq is not configured for unbound workers"); - - raw_spin_lock(&wqe->lock); - if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wqe->lock); - return true; - } - acct->nr_workers++; - raw_spin_unlock(&wqe->lock); - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); -} - -static void io_wqe_inc_running(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - - atomic_inc(&acct->nr_running); -} - -static void create_worker_cb(struct callback_head *cb) -{ - struct io_worker *worker; - struct io_wq *wq; - struct io_wqe *wqe; - struct io_wqe_acct *acct; - bool do_create = false; - - worker = container_of(cb, struct io_worker, create_work); - wqe = worker->wqe; - wq = wqe->wq; - acct = &wqe->acct[worker->create_index]; - raw_spin_lock(&wqe->lock); - if (acct->nr_workers < acct->max_workers) { - acct->nr_workers++; - do_create = true; - } - raw_spin_unlock(&wqe->lock); - if (do_create) { - create_io_worker(wq, wqe, worker->create_index); - } else { - atomic_dec(&acct->nr_running); - io_worker_ref_put(wq); - } - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); -} - -static bool io_queue_worker_create(struct io_worker *worker, - struct io_wqe_acct *acct, - task_work_func_t func) -{ - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - - /* raced with exit, just ignore create call */ - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - goto fail; - if (!io_worker_get(worker)) - goto fail; - /* - * create_state manages ownership of create_work/index. We should - * only need one entry per worker, as the worker going to sleep - * will trigger the condition, and waking will clear it once it - * runs the task_work. - */ - if (test_bit(0, &worker->create_state) || - test_and_set_bit_lock(0, &worker->create_state)) - goto fail_release; - - atomic_inc(&wq->worker_refs); - init_task_work(&worker->create_work, func); - worker->create_index = acct->index; - if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { - /* - * EXIT may have been set after checking it above, check after - * adding the task_work and remove any creation item if it is - * now set. wq exit does that too, but we can have added this - * work item after we canceled in io_wq_exit_workers(). - */ - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_wq_cancel_tw_create(wq); - io_worker_ref_put(wq); - return true; - } - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); -fail_release: - io_worker_release(worker); -fail: - atomic_dec(&acct->nr_running); - io_worker_ref_put(wq); - return false; -} - -static void io_wqe_dec_running(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - - if (!(worker->flags & IO_WORKER_F_UP)) - return; - - if (!atomic_dec_and_test(&acct->nr_running)) - return; - if (!io_acct_run_queue(acct)) - return; - - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(worker, acct, create_worker_cb); -} - -/* - * Worker will start processing some work. 
Move it to the busy list, if - * it's currently on the freelist - */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) -{ - if (worker->flags & IO_WORKER_F_FREE) { - worker->flags &= ~IO_WORKER_F_FREE; - raw_spin_lock(&wqe->lock); - hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wqe->lock); - } -} - -/* - * No work, worker going to sleep. Move to freelist, and unuse mm if we - * have one attached. Dropping the mm may potentially sleep, so we drop - * the lock in that case and return success. Since the caller has to - * retry the loop in that case (we changed task state), we don't regrab - * the lock if we return success. - */ -static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) -{ - if (!(worker->flags & IO_WORKER_F_FREE)) { - worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - } -} - -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return work->flags >> IO_WQ_HASH_SHIFT; -} - -static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) -{ - struct io_wq *wq = wqe->wq; - bool ret = false; - - spin_lock_irq(&wq->hash->wait.lock); - if (list_empty(&wqe->wait.entry)) { - __add_wait_queue(&wq->hash->wait, &wqe->wait); - if (!test_bit(hash, &wq->hash->map)) { - __set_current_state(TASK_RUNNING); - list_del_init(&wqe->wait.entry); - ret = true; - } - } - spin_unlock_irq(&wq->hash->wait.lock); - return ret; -} - -static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, - struct io_worker *worker) - __must_hold(acct->lock) -{ - struct io_wq_work_node *node, *prev; - struct io_wq_work *work, *tail; - unsigned int stall_hash = -1U; - struct io_wqe *wqe = worker->wqe; - - wq_list_for_each(node, prev, &acct->work_list) { - unsigned int hash; - - work = container_of(node, struct io_wq_work, list); - - /* not hashed, can run anytime */ - if (!io_wq_is_hashed(work)) { - wq_list_del(&acct->work_list, node, prev); - return work; - } - - hash = io_get_work_hash(work); - /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; - - /* hashed, can run if not already running */ - if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { - wqe->hash_tail[hash] = NULL; - wq_list_cut(&acct->work_list, &tail->list, prev); - return work; - } - if (stall_hash == -1U) - stall_hash = hash; - /* fast forward to a next hash, for-each will fix up @prev */ - node = &tail->list; - } - - if (stall_hash != -1U) { - bool unstalled; - - /* - * Set this before dropping the lock to avoid racing with new - * work being added and clearing the stalled bit. 
- */ - set_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&acct->lock); - unstalled = io_wait_on_hash(wqe, stall_hash); - raw_spin_lock(&acct->lock); - if (unstalled) { - clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - if (wq_has_sleeper(&wqe->wq->hash->wait)) - wake_up(&wqe->wq->hash->wait); - } - } - - return NULL; -} - -static bool io_flush_signals(void) -{ - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - return false; -} - -static void io_assign_current_work(struct io_worker *worker, - struct io_wq_work *work) -{ - if (work) { - io_flush_signals(); - cond_resched(); - } - - raw_spin_lock(&worker->lock); - worker->cur_work = work; - worker->next_work = NULL; - raw_spin_unlock(&worker->lock); -} - -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); - -static void io_worker_handle_work(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); - - do { - struct io_wq_work *work; - - /* - * If we got some work, mark us as busy. If we didn't, but - * the list isn't empty, it means we stalled on hashed work. - * Mark us stalled so we don't keep looking for work when we - * can't make progress, any work completion or insertion will - * clear the stalled flag. - */ - raw_spin_lock(&acct->lock); - work = io_get_next_work(acct, worker); - raw_spin_unlock(&acct->lock); - if (work) { - __io_worker_busy(wqe, worker); - - /* - * Make sure cancelation can find this, even before - * it becomes the active work. That avoids a window - * where the work has been removed from our general - * work list, but isn't yet discoverable as the - * current work item for this worker. 
- */ - raw_spin_lock(&worker->lock); - worker->next_work = work; - raw_spin_unlock(&worker->lock); - } else { - break; - } - io_assign_current_work(worker, work); - __set_current_state(TASK_RUNNING); - - /* handle a whole dependent link */ - do { - struct io_wq_work *next_hashed, *linked; - unsigned int hash = io_get_work_hash(work); - - next_hashed = wq_next_work(work); - - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(work); - io_assign_current_work(worker, NULL); - - linked = wq->free_work(work); - work = next_hashed; - if (!work && linked && !io_wq_is_hashed(linked)) { - work = linked; - linked = NULL; - } - io_assign_current_work(worker, work); - if (linked) - io_wqe_enqueue(wqe, linked); - - if (hash != -1U && !next_hashed) { - /* serialize hash clear with wake_up() */ - spin_lock_irq(&wq->hash->wait.lock); - clear_bit(hash, &wq->hash->map); - clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - spin_unlock_irq(&wq->hash->wait.lock); - if (wq_has_sleeper(&wq->hash->wait)) - wake_up(&wq->hash->wait); - } - } while (work); - } while (1); -} - -static int io_wqe_worker(void *data) -{ - struct io_worker *worker = data; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - bool last_timeout = false; - char buf[TASK_COMM_LEN]; - - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - - snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); - set_task_comm(current, buf); - - audit_alloc_kernel(current); - - while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - long ret; - - set_current_state(TASK_INTERRUPTIBLE); - while (io_acct_run_queue(acct)) - io_worker_handle_work(worker); - - raw_spin_lock(&wqe->lock); - /* timed out, exit unless we're the last worker */ - if (last_timeout && acct->nr_workers > 1) { - acct->nr_workers--; - raw_spin_unlock(&wqe->lock); - __set_current_state(TASK_RUNNING); - break; - } - last_timeout = false; - __io_worker_idle(wqe, worker); - raw_spin_unlock(&wqe->lock); - if (io_flush_signals()) - continue; - ret = schedule_timeout(WORKER_IDLE_TIMEOUT); - if (signal_pending(current)) { - struct ksignal ksig; - - if (!get_signal(&ksig)) - continue; - break; - } - last_timeout = !ret; - } - - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_worker_handle_work(worker); - - audit_free(current); - io_worker_exit(worker); - return 0; -} - -/* - * Called when a worker is scheduled in. Mark us as currently running. - */ -void io_wq_worker_running(struct task_struct *tsk) -{ - struct io_worker *worker = tsk->worker_private; - - if (!worker) - return; - if (!(worker->flags & IO_WORKER_F_UP)) - return; - if (worker->flags & IO_WORKER_F_RUNNING) - return; - worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(worker); -} - -/* - * Called when worker is going to sleep. If there are no workers currently - * running and we have work pending, wake up a free one or create a new one. 
- */ -void io_wq_worker_sleeping(struct task_struct *tsk) -{ - struct io_worker *worker = tsk->worker_private; - - if (!worker) - return; - if (!(worker->flags & IO_WORKER_F_UP)) - return; - if (!(worker->flags & IO_WORKER_F_RUNNING)) - return; - - worker->flags &= ~IO_WORKER_F_RUNNING; - io_wqe_dec_running(worker); -} - -static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, - struct task_struct *tsk) -{ - tsk->worker_private = worker; - worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); - tsk->flags |= PF_NO_SETAFFINITY; - - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); - worker->flags |= IO_WORKER_F_FREE; - raw_spin_unlock(&wqe->lock); - wake_up_new_task(tsk); -} - -static bool io_wq_work_match_all(struct io_wq_work *work, void *data) -{ - return true; -} - -static inline bool io_should_retry_thread(long err) -{ - /* - * Prevent perpetual task_work retry, if the task (or its group) is - * exiting. - */ - if (fatal_signal_pending(current)) - return false; - - switch (err) { - case -EAGAIN: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - return true; - default: - return false; - } -} - -static void create_worker_cont(struct callback_head *cb) -{ - struct io_worker *worker; - struct task_struct *tsk; - struct io_wqe *wqe; - - worker = container_of(cb, struct io_worker, create_work); - clear_bit_unlock(0, &worker->create_state); - wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); - if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); - io_worker_release(worker); - return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - - atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); - acct->nr_workers--; - if (!acct->nr_workers) { - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - - raw_spin_unlock(&wqe->lock); - while (io_acct_cancel_pending_work(wqe, acct, &match)) - ; - } else { - raw_spin_unlock(&wqe->lock); - } - io_worker_ref_put(wqe->wq); - kfree(worker); - return; - } - - /* re-create attempts grab a new worker ref, drop the existing one */ - io_worker_release(worker); - schedule_work(&worker->work); -} - -static void io_workqueue_create(struct work_struct *work) -{ - struct io_worker *worker = container_of(work, struct io_worker, work); - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - - if (!io_queue_worker_create(worker, acct, create_worker_cont)) - kfree(worker); -} - -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) -{ - struct io_wqe_acct *acct = &wqe->acct[index]; - struct io_worker *worker; - struct task_struct *tsk; - - __set_current_state(TASK_RUNNING); - - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); - if (!worker) { -fail: - atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&wqe->lock); - io_worker_ref_put(wq); - return false; - } - - refcount_set(&worker->ref, 1); - worker->wqe = wqe; - raw_spin_lock_init(&worker->lock); - init_completion(&worker->ref_done); - - if (index == IO_WQ_ACCT_BOUND) - worker->flags |= IO_WORKER_F_BOUND; - - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); - if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - kfree(worker); - goto fail; - } else { - INIT_WORK(&worker->work, 
io_workqueue_create); - schedule_work(&worker->work); - } - - return true; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - __set_notify_signal(worker->task); - wake_up_process(worker->task); - return false; -} - -static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) -{ - struct io_wq *wq = wqe->wq; - - do { - work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(work); - work = wq->free_work(work); - } while (work); -} - -static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) -{ - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - unsigned int hash; - struct io_wq_work *tail; - - if (!io_wq_is_hashed(work)) { -append: - wq_list_add_tail(&work->list, &acct->work_list); - return; - } - - hash = io_get_work_hash(work); - tail = wqe->hash_tail[hash]; - wqe->hash_tail[hash] = work; - if (!tail) - goto append; - - wq_list_add_after(&work->list, &tail->list, &acct->work_list); -} - -static bool io_wq_work_match_item(struct io_wq_work *work, void *data) -{ - return work == data; -} - -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) -{ - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - struct io_cb_cancel_data match; - unsigned work_flags = work->flags; - bool do_create; - - /* - * If io-wq is exiting for this task, or if the request has explicitly - * been marked as one that should not get executed, cancel it here. - */ - if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { - io_run_cancel(work, wqe); - return; - } - - raw_spin_lock(&acct->lock); - io_wqe_insert_work(wqe, work); - clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&acct->lock); - - raw_spin_lock(&wqe->lock); - rcu_read_lock(); - do_create = !io_wqe_activate_free_worker(wqe, acct); - rcu_read_unlock(); - - raw_spin_unlock(&wqe->lock); - - if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running))) { - bool did_create; - - did_create = io_wqe_create_worker(wqe, acct); - if (likely(did_create)) - return; - - raw_spin_lock(&wqe->lock); - if (acct->nr_workers) { - raw_spin_unlock(&wqe->lock); - return; - } - raw_spin_unlock(&wqe->lock); - - /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - - io_acct_cancel_pending_work(wqe, acct, &match); - } -} - -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) -{ - struct io_wqe *wqe = wq->wqes[numa_node_id()]; - - io_wqe_enqueue(wqe, work); -} - -/* - * Work items that hash to the same value will not be done in parallel. - * Used to limit concurrent writes, generally hashed by inode. 
- */ -void io_wq_hash_work(struct io_wq_work *work, void *val) -{ - unsigned int bit; - - bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); -} - -static bool __io_wq_worker_cancel(struct io_worker *worker, - struct io_cb_cancel_data *match, - struct io_wq_work *work) -{ - if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; - __set_notify_signal(worker->task); - return true; - } - - return false; -} - -static bool io_wq_worker_cancel(struct io_worker *worker, void *data) -{ - struct io_cb_cancel_data *match = data; - - /* - * Hold the lock to avoid ->cur_work going out of scope, caller - * may dereference the passed in work. - */ - raw_spin_lock(&worker->lock); - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || - __io_wq_worker_cancel(worker, match, worker->next_work)) - match->nr_running++; - raw_spin_unlock(&worker->lock); - - return match->nr_running && !match->cancel_all; -} - -static inline void io_wqe_remove_pending(struct io_wqe *wqe, - struct io_wq_work *work, - struct io_wq_work_node *prev) -{ - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - unsigned int hash = io_get_work_hash(work); - struct io_wq_work *prev_work = NULL; - - if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { - if (prev) - prev_work = container_of(prev, struct io_wq_work, list); - if (prev_work && io_get_work_hash(prev_work) == hash) - wqe->hash_tail[hash] = prev_work; - else - wqe->hash_tail[hash] = NULL; - } - wq_list_del(&acct->work_list, &work->list, prev); -} - -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, - struct io_cb_cancel_data *match) -{ - struct io_wq_work_node *node, *prev; - struct io_wq_work *work; - - raw_spin_lock(&acct->lock); - wq_list_for_each(node, prev, &acct->work_list) { - work = container_of(node, struct io_wq_work, list); - if (!match->fn(work, match->data)) - continue; - io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock(&acct->lock); - io_run_cancel(work, wqe); - match->nr_pending++; - /* not safe to continue after unlock */ - return true; - } - raw_spin_unlock(&acct->lock); - - return false; -} - -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) -{ - int i; -retry: - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); - - if (io_acct_cancel_pending_work(wqe, acct, match)) { - if (match->cancel_all) - goto retry; - break; - } - } -} - -static void io_wqe_cancel_running_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) -{ - rcu_read_lock(); - io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); - rcu_read_unlock(); -} - -enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data, bool cancel_all) -{ - struct io_cb_cancel_data match = { - .fn = cancel, - .data = data, - .cancel_all = cancel_all, - }; - int node; - - /* - * First check pending list, if we're lucky we can just remove it - * from there. CANCEL_OK means that the work is returned as-new, - * no completion will be posted for it. - * - * Then check if a free (going busy) or busy worker has the work - * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempt to signal cancellation. The - * completion will run normally in this case. - * - * Do both of these while holding the wqe->lock, to ensure that - * we'll find a work item regardless of state. 
- */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_running && !match.cancel_all) - return IO_WQ_CANCEL_RUNNING; - } - - if (match.nr_running) - return IO_WQ_CANCEL_RUNNING; - if (match.nr_pending) - return IO_WQ_CANCEL_OK; - return IO_WQ_CANCEL_NOTFOUND; -} - -static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, - int sync, void *key) -{ - struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); - int i; - - list_del_init(&wait->entry); - - rcu_read_lock(); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wqe_activate_free_worker(wqe, acct); - } - rcu_read_unlock(); - return 1; -} - -struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) -{ - int ret, node, i; - struct io_wq *wq; - - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); - if (WARN_ON_ONCE(!bounded)) - return ERR_PTR(-EINVAL); - - wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); - if (!wq) - return ERR_PTR(-ENOMEM); - ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) - goto err_wq; - - refcount_inc(&data->hash->refs); - wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; - - ret = -ENOMEM; - for_each_node(node) { - struct io_wqe *wqe; - int alloc_node = node; - - if (!node_online(alloc_node)) - alloc_node = NUMA_NO_NODE; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); - if (!wqe) - goto err; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) - goto err; - cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wq->wqes[node] = wqe; - wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = - task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - acct->index = i; - atomic_set(&acct->nr_running, 0); - INIT_WQ_LIST(&acct->work_list); - raw_spin_lock_init(&acct->lock); - } - wqe->wq = wq; - raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); - } - - wq->task = get_task_struct(data->task); - atomic_set(&wq->worker_refs, 1); - init_completion(&wq->worker_done); - return wq; -err: - io_wq_put_hash(data->hash); - cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) { - if (!wq->wqes[node]) - continue; - free_cpumask_var(wq->wqes[node]->cpu_mask); - kfree(wq->wqes[node]); - } -err_wq: - kfree(wq); - return ERR_PTR(ret); -} - -static bool io_task_work_match(struct callback_head *cb, void *data) -{ - struct io_worker *worker; - - if (cb->func != create_worker_cb && cb->func != create_worker_cont) - return false; - worker = container_of(cb, struct io_worker, create_work); - return worker->wqe->wq == data; -} - -void io_wq_exit_start(struct io_wq *wq) -{ - set_bit(IO_WQ_BIT_EXIT, &wq->state); -} - -static void io_wq_cancel_tw_create(struct io_wq *wq) -{ - struct callback_head *cb; - - while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { - struct io_worker *worker; - - worker = 
container_of(cb, struct io_worker, create_work); - io_worker_cancel_cb(worker); - } -} - -static void io_wq_exit_workers(struct io_wq *wq) -{ - int node; - - if (!wq->task) - return; - - io_wq_cancel_tw_create(wq); - - rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } - rcu_read_unlock(); - io_worker_ref_put(wq); - wait_for_completion(&wq->worker_done); - - for_each_node(node) { - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - } - put_task_struct(wq->task); - wq->task = NULL; -} - -static void io_wq_destroy(struct io_wq *wq) -{ - int node; - - cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); - kfree(wqe); - } - io_wq_put_hash(wq->hash); - kfree(wq); -} - -void io_wq_put_and_exit(struct io_wq *wq) -{ - WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)); - - io_wq_exit_workers(wq); - io_wq_destroy(wq); -} - -struct online_data { - unsigned int cpu; - bool online; -}; - -static bool io_wq_worker_affinity(struct io_worker *worker, void *data) -{ - struct online_data *od = data; - - if (od->online) - cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); - else - cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); - return false; -} - -static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) -{ - struct online_data od = { - .cpu = cpu, - .online = online - }; - int i; - - rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); - rcu_read_unlock(); - return 0; -} - -static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); - - return __io_wq_cpu_online(wq, cpu, true); -} - -static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) -{ - struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); - - return __io_wq_cpu_online(wq, cpu, false); -} - -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) -{ - int i; - - rcu_read_lock(); - for_each_node(i) { - struct io_wqe *wqe = wq->wqes[i]; - - if (mask) - cpumask_copy(wqe->cpu_mask, mask); - else - cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); - } - rcu_read_unlock(); - return 0; -} - -/* - * Set max number of unbounded workers, returns old value. If new_count is 0, - * then just return the old value. 
- */ -int io_wq_max_workers(struct io_wq *wq, int *new_count) -{ - int prev[IO_WQ_ACCT_NR]; - bool first_node = true; - int i, node; - - BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); - BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); - BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); - - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) - new_count[i] = task_rlimit(current, RLIMIT_NPROC); - } - - for (i = 0; i < IO_WQ_ACCT_NR; i++) - prev[i] = 0; - - rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_wqe_acct *acct; - - raw_spin_lock(&wqe->lock); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; - if (first_node) - prev[i] = max_t(int, acct->max_workers, prev[i]); - if (new_count[i]) - acct->max_workers = new_count[i]; - } - raw_spin_unlock(&wqe->lock); - first_node = false; - } - rcu_read_unlock(); - - for (i = 0; i < IO_WQ_ACCT_NR; i++) - new_count[i] = prev[i]; - - return 0; -} - -static __init int io_wq_init(void) -{ - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", - io_wq_cpu_online, io_wq_cpu_offline); - if (ret < 0) - return ret; - io_wq_online = ret; - return 0; -} -subsys_initcall(io_wq_init); diff --git a/fs/io-wq.h b/fs/io-wq.h deleted file mode 100644 index ba6eee76d028..000000000000 --- a/fs/io-wq.h +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef INTERNAL_IO_WQ_H -#define INTERNAL_IO_WQ_H - -#include <linux/refcount.h> - -struct io_wq; - -enum { - IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HASHED = 2, - IO_WQ_WORK_UNBOUND = 4, - IO_WQ_WORK_CONCURRENT = 16, - - IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ -}; - -enum io_wq_cancel { - IO_WQ_CANCEL_OK, /* cancelled before started */ - IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */ - IO_WQ_CANCEL_NOTFOUND, /* work not found */ -}; - -struct io_wq_work_node { - struct io_wq_work_node *next; -}; - -struct io_wq_work_list { - struct io_wq_work_node *first; - struct io_wq_work_node *last; -}; - -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) - -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ -} while (0) - -static inline void wq_list_add_after(struct io_wq_work_node *node, - struct io_wq_work_node *pos, - struct io_wq_work_list *list) -{ - struct io_wq_work_node *next = pos->next; - - pos->next = node; - node->next = next; - if (!next) - list->last = node; -} - -/** - * wq_list_merge - merge the second list to the first one. - * @list0: the first list - * @list1: the second list - * Return the first node after mergence. 
- */ -static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, - struct io_wq_work_list *list1) -{ - struct io_wq_work_node *ret; - - if (!list0->first) { - ret = list1->first; - } else { - ret = list0->first; - list0->last->next = list1->first; - } - INIT_WQ_LIST(list0); - INIT_WQ_LIST(list1); - return ret; -} - -static inline void wq_list_add_tail(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = NULL; - if (!list->first) { - list->last = node; - WRITE_ONCE(list->first, node); - } else { - list->last->next = node; - list->last = node; - } -} - -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - -static inline void wq_list_cut(struct io_wq_work_list *list, - struct io_wq_work_node *last, - struct io_wq_work_node *prev) -{ - /* first in the list, if prev==NULL */ - if (!prev) - WRITE_ONCE(list->first, last->next); - else - prev->next = last->next; - - if (last == list->last) - list->last = prev; - last->next = NULL; -} - -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - -static inline void wq_stack_add_head(struct io_wq_work_node *node, - struct io_wq_work_node *stack) -{ - node->next = stack->next; - stack->next = node; -} - -static inline void wq_list_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, - struct io_wq_work_node *prev) -{ - wq_list_cut(list, node, prev); -} - -static inline -struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) -{ - struct io_wq_work_node *node = stack->next; - - stack->next = node->next; - return node; -} - -struct io_wq_work { - struct io_wq_work_node list; - unsigned flags; - int cancel_seq; -}; - -static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) -{ - if (!work->list.next) - return NULL; - - return container_of(work->list.next, struct io_wq_work, list); -} - -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - -struct io_wq_hash { - refcount_t refs; - unsigned long map; - struct wait_queue_head wait; -}; - -static inline void io_wq_put_hash(struct io_wq_hash *hash) -{ - if (refcount_dec_and_test(&hash->refs)) - kfree(hash); -} - -struct io_wq_data { - struct io_wq_hash *hash; - struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; -}; - -struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -void io_wq_exit_start(struct io_wq *wq); -void io_wq_put_and_exit(struct io_wq *wq); - -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); -void io_wq_hash_work(struct io_wq_work *work, void *val); - -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); -int io_wq_max_workers(struct io_wq *wq, int *new_count); - -static inline bool io_wq_is_hashed(struct io_wq_work *work) -{ - return work->flags & IO_WQ_WORK_HASHED; -} - -typedef bool (work_cancel_fn)(struct io_wq_work *, void *); - -enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data, bool cancel_all); - -#if defined(CONFIG_IO_WQ) -extern void 
io_wq_worker_sleeping(struct task_struct *); -extern void io_wq_worker_running(struct task_struct *); -#else -static inline void io_wq_worker_sleeping(struct task_struct *tsk) -{ -} -static inline void io_wq_worker_running(struct task_struct *tsk) -{ -} -#endif - -static inline bool io_wq_current_is_worker(void) -{ - return in_task() && (current->flags & PF_IO_WORKER) && - current->worker_private; -} -#endif diff --git a/fs/io_uring.c b/fs/io_uring.c deleted file mode 100644 index 3aab4182fd89..000000000000 --- a/fs/io_uring.c +++ /dev/null @@ -1,13316 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared application/kernel submission and completion ring pairs, for - * supporting fast/efficient IO. - * - * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. - * - * After the application reads the CQ ring tail, it must use an - * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses - * before writing the tail (using smp_load_acquire to read the tail will - * do). It also needs a smp_mb() before updating CQ head (ordering the - * entry load(s) with the head store), pairing with an implicit barrier - * through a control-dependency in io_get_cqe (smp_store_release to - * store head will do). Failure to do so could lead to reading invalid - * CQ entries. - * - * Likewise, the application must use an appropriate smp_wmb() before - * writing the SQ tail (ordering SQ entry stores with the tail store), - * which pairs with smp_load_acquire in io_get_sqring (smp_store_release - * to store the tail will do). And it needs a barrier ordering the SQ - * head load before writing new SQ entries (smp_load_acquire to read - * head will do). - * - * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application - * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* - * updating the SQ tail; a full memory barrier smp_mb() is needed - * between. - * - * Also see the examples in the liburing library: - * - * git://git.kernel.dk/liburing - * - * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens - * from data shared between the kernel and application. This is done both - * for ordering purposes, but also to ensure that once a value is loaded from - * data that the application could potentially modify, it remains stable. 
- * - * Copyright (C) 2018-2019 Jens Axboe - * Copyright (c) 2018-2019 Christoph Hellwig - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/syscalls.h> -#include <linux/compat.h> -#include <net/compat.h> -#include <linux/refcount.h> -#include <linux/uio.h> -#include <linux/bits.h> - -#include <linux/sched/signal.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/blk-mq.h> -#include <linux/bvec.h> -#include <linux/net.h> -#include <net/sock.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/anon_inodes.h> -#include <linux/sched/mm.h> -#include <linux/uaccess.h> -#include <linux/nospec.h> -#include <linux/sizes.h> -#include <linux/hugetlb.h> -#include <linux/highmem.h> -#include <linux/namei.h> -#include <linux/fsnotify.h> -#include <linux/fadvise.h> -#include <linux/eventpoll.h> -#include <linux/splice.h> -#include <linux/task_work.h> -#include <linux/pagemap.h> -#include <linux/io_uring.h> -#include <linux/audit.h> -#include <linux/security.h> -#include <linux/xattr.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/io_uring.h> - -#include <uapi/linux/io_uring.h> - -#include "internal.h" -#include "io-wq.h" - -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 - -/* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 20) -#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ - IORING_REGISTER_LAST + IORING_OP_LAST) - -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - -#define IORING_MAX_REG_BUFFERS (1U << 14) - -#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) - -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) - -#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) - -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) - -#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) - -#define IO_TCTX_REFS_CACHE_NR (1U << 10) - -struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; -}; - -/* - * This data is shared with the application through the mmap at offsets - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_sqring_offsets when calling io_uring_setup. - */ -struct io_rings { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The kernel controls head of the sq ring and the tail of the cq ring, - * and the application controls tail of the sq ring and the head of the - * cq ring. - */ - struct io_uring sq, cq; - /* - * Bitmasks to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 sq_ring_mask, cq_ring_mask; - /* Ring sizes (constant, power of 2) */ - u32 sq_ring_entries, cq_ring_entries; - /* - * Number of invalid entries dropped by the kernel due to - * invalid index stored in array - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. 
get number of "new events" by comparing to - * cached value). - * - * After a new SQ head value was read by the application this - * counter includes all submissions that were dropped reaching - * the new SQ head (and possibly more). - */ - u32 sq_dropped; - /* - * Runtime SQ flags - * - * Written by the kernel, shouldn't be modified by the - * application. - * - * The application needs a full memory barrier before checking - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. - */ - atomic_t sq_flags; - /* - * Runtime CQ flags - * - * Written by the application, shouldn't be modified by the - * kernel. - */ - u32 cq_flags; - /* - * Number of completion events lost because the queue was full; - * this should be avoided by the application by making sure - * there are not more requests pending than there is space in - * the completion queue. - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * As completion events come in out of order this counter is not - * ordered with any other data. - */ - u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. - */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; -}; - -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - -struct io_ring_ctx; - -struct io_overflow_cqe { - struct list_head list; - struct io_uring_cqe cqe; -}; - -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) - -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - -struct io_rsrc_put { - struct list_head list; - u64 tag; - union { - void *rsrc; - struct file *file; - struct io_mapped_ubuf *buf; - }; -}; - -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - -struct io_rsrc_node { - struct percpu_ref refs; - struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; -}; - -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - rsrc_put_fn *do_put; - atomic_t refs; - struct completion done; - bool quiesce; -}; - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. 
- */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u32 head; - __u32 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - -struct io_restriction { - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); - u8 sqe_flags_allowed; - u8 sqe_flags_required; - bool registered; -}; - -enum { - IO_SQ_THREAD_SHOULD_STOP = 0, - IO_SQ_THREAD_SHOULD_PARK, -}; - -struct io_sq_data { - refcount_t refs; - atomic_t park_pending; - struct mutex lock; - - /* ctx's that are using this sqd */ - struct list_head ctx_list; - - struct task_struct *thread; - struct wait_queue_head wait; - - unsigned sq_thread_idle; - int sq_cpu; - pid_t task_pid; - pid_t task_tgid; - - unsigned long state; - struct completion exited; -}; - -#define IO_COMPL_BATCH 32 -#define IO_REQ_CACHE_SIZE 32 -#define IO_REQ_ALLOC_BATCH 8 - -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -struct io_submit_state { - /* inline/task_work completion list, under ->uring_lock */ - struct io_wq_work_node free_list; - /* batch completion logic */ - struct io_wq_work_list compl_reqs; - struct io_submit_link link; - - bool plug_started; - bool need_plug; - bool flush_cqes; - unsigned short submit_nr; - struct blk_plug plug; -}; - -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; -}; - -#define BGID_ARRAY 64 - -struct io_ring_ctx { - /* const or read-mostly hot data */ - struct { - struct percpu_ref refs; - - struct io_rings *rings; - unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; - unsigned int drain_next: 1; - unsigned int restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int drain_disabled: 1; - unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; - } ____cacheline_aligned_in_smp; - - /* submission data */ - struct { - struct mutex uring_lock; - - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. 
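To make that indirection concrete: the application fills an SQE, publishes its index through sq_array, then store-releases the tail, exactly as the ordering rules in the header comment require. A minimal sketch under the simplest layout (slot i of sqes[] is published as index i); sq_view and its fields are illustrative names, not liburing's API.

#include <stdatomic.h>
#include <stdint.h>
#include <linux/io_uring.h>

/* Illustrative view of a mapped SQ ring; field names are made up. */
struct sq_view {
	_Atomic uint32_t *ktail;	/* SQ tail, written by the app */
	uint32_t *array;		/* indices into sqes[] */
	struct io_uring_sqe *sqes;	/* mmapped at IORING_OFF_SQES */
	uint32_t mask;			/* sq_ring_mask */
};

/* Queue one SQE; a real submitter would first check the ring isn't full. */
static void push_sqe(struct sq_view *sq, const struct io_uring_sqe *src)
{
	uint32_t tail = atomic_load_explicit(sq->ktail, memory_order_relaxed);
	uint32_t idx = tail & sq->mask;

	sq->sqes[idx] = *src;
	sq->array[idx] = idx;
	/* release: the SQE and index stores are visible before the new tail */
	atomic_store_explicit(sq->ktail, tail + 1, memory_order_release);
}

Unless the ring was set up with IORING_SETUP_SQPOLL, entries queued this way still need an io_uring_enter() call to be submitted.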
- */ - u32 *sq_array; - struct io_uring_sqe *sq_sqes; - unsigned cached_sq_head; - unsigned sq_entries; - struct list_head defer_list; - - /* - * Fixed resources fast path, should be accessed only under - * uring_lock, and updated through io_uring_register(2) - */ - struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; - atomic_t cancel_seq; - struct io_file_table file_table; - unsigned nr_user_files; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct io_submit_state submit_state; - - struct io_buffer_list *io_bl; - struct xarray io_bl_xa; - struct list_head io_buffers_cache; - - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; - struct list_head apoll_cache; - struct xarray personalities; - u32 pers_next; - unsigned sq_thread_idle; - } ____cacheline_aligned_in_smp; - - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - struct { - /* - * We cache a range of free CQEs we can use, once exhausted it - * should go through a slower range setup, see __io_get_cqe() - */ - struct io_uring_cqe *cqe_cached; - struct io_uring_cqe *cqe_sentinel; - - unsigned cached_cq_tail; - unsigned cq_entries; - struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; - unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - } ____cacheline_aligned_in_smp; - - struct { - spinlock_t completion_lock; - - spinlock_t timeout_lock; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. 
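The cqe_cached/cqe_sentinel pair above is a generic fast-path device: hand out items from a prevalidated range with a pointer bump, and fall back to a slower refill only when the range is exhausted. A self-contained toy version of the pattern (not the kernel's code):

#include <stdio.h>

struct range_cache {
	int *cached;	/* next item to hand out */
	int *sentinel;	/* one past the last prevalidated item */
	int *end;	/* end of the backing array */
};

/* Slow path: prevalidate what's left (stands in for __io_get_cqe()). */
static int *refill(struct range_cache *rc)
{
	if (rc->cached == rc->end)
		return NULL;	/* truly exhausted */
	rc->sentinel = rc->end;
	return rc->cached++;
}

/* Fast path mirrors io_get_cqe(): a bounds check and a pointer bump. */
static int *get_item(struct range_cache *rc)
{
	if (rc->cached < rc->sentinel)
		return rc->cached++;
	return refill(rc);
}

int main(void)
{
	int items[4] = { 1, 2, 3, 4 };
	struct range_cache rc = { items, items, items + 4 };
	int *p;

	while ((p = get_item(&rc)))
		printf("%d\n", *p);
	return 0;
}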
- */ - struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; - unsigned cancel_hash_bits; - bool poll_multi_queue; - - struct list_head io_buffers_comp; - } ____cacheline_aligned_in_smp; - - struct io_restriction restrictions; - - /* slow path rsrc auxiliary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - - /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - u32 iowq_limits[2]; - bool iowq_limits_set; - }; -}; - -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - struct file **registered_rings; - bool task_running; -}; - -/* - * First field must be the file pointer in all the - * iocb unions! 
See also 'struct kiocb' in <linux/fs.h> - */ -struct io_poll_iocb { - struct file *file; - struct wait_queue_head *head; - __poll_t events; - struct wait_queue_entry wait; -}; - -struct io_poll_update { - struct file *file; - u64 old_user_data; - u64 new_user_data; - __poll_t events; - bool update_events; - bool update_user_data; -}; - -struct io_close { - struct file *file; - int fd; - u32 file_slot; - u32 flags; -}; - -struct io_timeout_data { - struct io_kiocb *req; - struct hrtimer timer; - struct timespec64 ts; - enum hrtimer_mode mode; - u32 flags; -}; - -struct io_accept { - struct file *file; - struct sockaddr __user *addr; - int __user *addr_len; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_socket { - struct file *file; - int domain; - int type; - int protocol; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_sync { - struct file *file; - loff_t len; - loff_t off; - int flags; - int mode; -}; - -struct io_cancel { - struct file *file; - u64 addr; - u32 flags; - s32 fd; -}; - -struct io_timeout { - struct file *file; - u32 off; - u32 target_seq; - struct list_head list; - /* head of the link, used by linked timeouts only */ - struct io_kiocb *head; - /* for linked completions */ - struct io_kiocb *prev; -}; - -struct io_timeout_rem { - struct file *file; - u64 addr; - - /* timeout update */ - struct timespec64 ts; - u32 flags; - bool ltimeout; -}; - -struct io_rw { - /* NOTE: kiocb has the file as the first member, so don't do it here */ - struct kiocb kiocb; - u64 addr; - u32 len; - rwf_t flags; -}; - -struct io_connect { - struct file *file; - struct sockaddr __user *addr; - int addr_len; -}; - -struct io_sr_msg { - struct file *file; - union { - struct compat_msghdr __user *umsg_compat; - struct user_msghdr __user *umsg; - void __user *buf; - }; - int msg_flags; - size_t len; - size_t done_io; - unsigned int flags; -}; - -struct io_open { - struct file *file; - int dfd; - u32 file_slot; - struct filename *filename; - struct open_how how; - unsigned long nofile; -}; - -struct io_rsrc_update { - struct file *file; - u64 arg; - u32 nr_args; - u32 offset; -}; - -struct io_fadvise { - struct file *file; - u64 offset; - u32 len; - u32 advice; -}; - -struct io_madvise { - struct file *file; - u64 addr; - u32 len; - u32 advice; -}; - -struct io_epoll { - struct file *file; - int epfd; - int op; - int fd; - struct epoll_event event; -}; - -struct io_splice { - struct file *file_out; - loff_t off_out; - loff_t off_in; - u64 len; - int splice_fd_in; - unsigned int flags; -}; - -struct io_provide_buf { - struct file *file; - __u64 addr; - __u32 len; - __u32 bgid; - __u16 nbufs; - __u16 bid; -}; - -struct io_statx { - struct file *file; - int dfd; - unsigned int mask; - unsigned int flags; - struct filename *filename; - struct statx __user *buffer; -}; - -struct io_shutdown { - struct file *file; - int how; -}; - -struct io_rename { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_unlink { - struct file *file; - int dfd; - int flags; - struct filename *filename; -}; - -struct io_mkdir { - struct file *file; - int dfd; - umode_t mode; - struct filename *filename; -}; - -struct io_symlink { - struct file *file; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; -}; - -struct io_hardlink { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_msg { - struct 
file *file; - u64 user_data; - u32 len; -}; - -struct io_nop { - struct file *file; - u64 extra1; - u64 extra2; -}; - -struct io_async_connect { - struct sockaddr_storage address; -}; - -struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; - /* points to an allocated iov, if NULL we use fast_iov instead */ - struct iovec *free_iov; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; -}; - -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - -struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; - struct wait_page_queue wpq; -}; - -struct io_xattr { - struct file *file; - struct xattr_ctx ctx; - struct filename *filename; -}; - -enum { - REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, - REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, - REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, - REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, - REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, - REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, - - /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_BIT = 8, - REQ_F_INFLIGHT_BIT, - REQ_F_CUR_POS_BIT, - REQ_F_NOWAIT_BIT, - REQ_F_LINK_TIMEOUT_BIT, - REQ_F_NEED_CLEANUP_BIT, - REQ_F_POLLED_BIT, - REQ_F_BUFFER_SELECTED_BIT, - REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, - REQ_F_REISSUE_BIT, - REQ_F_CREDS_BIT, - REQ_F_REFCOUNT_BIT, - REQ_F_ARM_LTIMEOUT_BIT, - REQ_F_ASYNC_DATA_BIT, - REQ_F_SKIP_LINK_CQES_BIT, - REQ_F_SINGLE_POLL_BIT, - REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, - REQ_F_APOLL_MULTISHOT_BIT, - /* keep async read/write and isreg together and in order */ - REQ_F_SUPPORT_NOWAIT_BIT, - REQ_F_ISREG_BIT, - - /* not a real bit, just to check we're not overflowing the space */ - __REQ_F_LAST_BIT, -}; - -enum { - /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), - /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), - /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), - /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), - /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), - /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), - - /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), - /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), - /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), - /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), - /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), - /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* has creds assigned */ - REQ_F_CREDS 
= BIT(REQ_F_CREDS_BIT), - /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), - /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), - /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), - /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), - /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), - /* double poll may be active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), - /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), -}; - -struct async_poll { - struct io_poll_iocb poll; - struct io_poll_iocb *double_poll; -}; - -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); - -struct io_task_work { - union { - struct io_wq_work_node node; - struct llist_node fallback_node; - }; - io_req_tw_func_t func; -}; - -enum { - IORING_RSRC_FILE = 0, - IORING_RSRC_BUFFER = 1, -}; - -struct io_cqe { - __u64 user_data; - __s32 res; - /* fd initially, then cflags for completion */ - union { - __u32 flags; - int fd; - }; -}; - -enum { - IO_CHECK_CQ_OVERFLOW_BIT, - IO_CHECK_CQ_DROPPED_BIT, -}; - -/* - * NOTE! Each of the iocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. - */ -struct io_kiocb { - union { - struct file *file; - struct io_rw rw; - struct io_poll_iocb poll; - struct io_poll_update poll_update; - struct io_accept accept; - struct io_sync sync; - struct io_cancel cancel; - struct io_timeout timeout; - struct io_timeout_rem timeout_rem; - struct io_connect connect; - struct io_sr_msg sr_msg; - struct io_open open; - struct io_close close; - struct io_rsrc_update rsrc_update; - struct io_fadvise fadvise; - struct io_madvise madvise; - struct io_epoll epoll; - struct io_splice splice; - struct io_provide_buf pbuf; - struct io_statx statx; - struct io_shutdown shutdown; - struct io_rename rename; - struct io_unlink unlink; - struct io_mkdir mkdir; - struct io_symlink symlink; - struct io_hardlink hardlink; - struct io_msg msg; - struct io_xattr xattr; - struct io_socket sock; - struct io_nop nop; - struct io_uring_cmd uring_cmd; - }; - - u8 opcode; - /* polled IO has completed */ - u8 iopoll_completed; - /* - * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. - */ - u16 buf_index; - unsigned int flags; - - struct io_cqe cqe; - - struct io_ring_ctx *ctx; - struct task_struct *task; - - struct io_rsrc_node *rsrc_node; - - union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; - - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. - */ - struct io_buffer_list *buf_list; - }; - - union { - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; - /* cache ->apoll->events */ - __poll_t apoll_events; - }; - atomic_t refs; - atomic_t poll_refs; - struct io_task_work io_task_work; - /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; - /* internal polling, see IORING_FEAT_FAST_POLL */ - struct async_poll *apoll; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; - /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ - struct io_kiocb *link; - /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; - struct io_wq_work work; -}; - -struct io_tctx_node { - struct list_head ctx_node; - struct task_struct *task; - struct io_ring_ctx *ctx; -}; - -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; - u32 seq; -}; - -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - -/* - * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into - * the following sqe if SQE128 is used. - */ -#define uring_cmd_pdu_size(is_sqe128) \ - ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ - offsetof(struct io_uring_sqe, cmd)) - -struct io_op_def { - /* needs req->file assigned */ - unsigned needs_file : 1; - /* should block plug */ - unsigned plug : 1; - /* hash wq insertion if file is a regular file */ - unsigned hash_reg_file : 1; - /* unbound wq insertion if file is a non-regular file */ - unsigned unbound_nonreg_file : 1; - /* set if opcode supports polled "wait" */ - unsigned pollin : 1; - unsigned pollout : 1; - unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; - /* do prep async if is going to be punted */ - unsigned needs_async_setup : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; - /* skip auditing */ - unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; - /* size of async data needed, if any */ - unsigned short async_size; -}; - -static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - .buffer_select = 1, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - 
.unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_connect), - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - }, - [IORING_OP_OPENAT] = {}, - [IORING_OP_CLOSE] = {}, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_MADVISE] = {}, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - }, - [IORING_OP_OPENAT2] = { - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - }, - [IORING_OP_RENAMEAT] = {}, - [IORING_OP_UNLINKAT] = {}, - [IORING_OP_MKDIRAT] = {}, - [IORING_OP_SYMLINKAT] = {}, - [IORING_OP_LINKAT] = {}, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_SETXATTR] = {}, - [IORING_OP_FGETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_GETXATTR] = {}, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .needs_async_setup = 1, - .async_size = uring_cmd_pdu_size(1), - }, -}; - -/* requests with any of those set should undergo io_disarm_next() */ -#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) - -static bool io_disarm_next(struct io_kiocb *req); -static void io_uring_del_tctx_node(unsigned long index); -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all); -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); - -static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); -static void io_dismantle_req(struct io_kiocb *req); 
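The io_op_defs[] table above is what keeps the issue path branch-light: every per-opcode policy lives in data and is consulted as io_op_defs[req->opcode]. The same designated-initializer pattern in miniature (all names below are hypothetical, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

enum my_op { OP_NOP, OP_READ, OP_WRITE };

struct my_op_def {
	bool needs_file;
	bool pollin;
	bool pollout;
	unsigned short async_size;
};

static const struct my_op_def my_op_defs[] = {
	[OP_NOP]   = {},
	[OP_READ]  = { .needs_file = true, .pollin = true,  .async_size = 64 },
	[OP_WRITE] = { .needs_file = true, .pollout = true, .async_size = 64 },
};

int main(void)
{
	const struct my_op_def *def = &my_op_defs[OP_READ];

	/* per-opcode decisions become plain table lookups */
	printf("needs_file=%d pollin=%d async_size=%hu\n",
	       def->needs_file, def->pollin, def->async_size);
	return 0;
}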
-static void io_queue_linked_timeout(struct io_kiocb *req); -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args); -static void io_clean_op(struct io_kiocb *req); -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); -static struct file *io_file_get_normal(struct io_kiocb *req, int fd); -static void io_queue_sqe(struct io_kiocb *req); -static void io_rsrc_put_work(struct work_struct *work); - -static void io_req_task_queue(struct io_kiocb *req); -static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static int io_req_prep_async(struct io_kiocb *req); - -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset); -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); -static void io_eventfd_signal(struct io_ring_ctx *ctx); -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); - -static struct kmem_cache *req_cachep; - -static const struct file_operations io_uring_fops; - -const char *io_uring_get_opcode(u8 opcode) -{ - switch ((enum io_uring_op)opcode) { - case IORING_OP_NOP: - return "NOP"; - case IORING_OP_READV: - return "READV"; - case IORING_OP_WRITEV: - return "WRITEV"; - case IORING_OP_FSYNC: - return "FSYNC"; - case IORING_OP_READ_FIXED: - return "READ_FIXED"; - case IORING_OP_WRITE_FIXED: - return "WRITE_FIXED"; - case IORING_OP_POLL_ADD: - return "POLL_ADD"; - case IORING_OP_POLL_REMOVE: - return "POLL_REMOVE"; - case IORING_OP_SYNC_FILE_RANGE: - return "SYNC_FILE_RANGE"; - case IORING_OP_SENDMSG: - return "SENDMSG"; - case IORING_OP_RECVMSG: - return "RECVMSG"; - case IORING_OP_TIMEOUT: - return "TIMEOUT"; - case IORING_OP_TIMEOUT_REMOVE: - return "TIMEOUT_REMOVE"; - case IORING_OP_ACCEPT: - return "ACCEPT"; - case IORING_OP_ASYNC_CANCEL: - return "ASYNC_CANCEL"; - case IORING_OP_LINK_TIMEOUT: - return "LINK_TIMEOUT"; - case IORING_OP_CONNECT: - return "CONNECT"; - case IORING_OP_FALLOCATE: - return "FALLOCATE"; - case IORING_OP_OPENAT: - return "OPENAT"; - case IORING_OP_CLOSE: - return "CLOSE"; - case IORING_OP_FILES_UPDATE: - return "FILES_UPDATE"; - case IORING_OP_STATX: - return "STATX"; - case IORING_OP_READ: - return "READ"; - case IORING_OP_WRITE: - return "WRITE"; - case IORING_OP_FADVISE: - return "FADVISE"; - case IORING_OP_MADVISE: - return "MADVISE"; - case IORING_OP_SEND: - return "SEND"; - case IORING_OP_RECV: - return "RECV"; - case IORING_OP_OPENAT2: - return "OPENAT2"; - case IORING_OP_EPOLL_CTL: - return "EPOLL_CTL"; - case IORING_OP_SPLICE: - return "SPLICE"; - case IORING_OP_PROVIDE_BUFFERS: - return "PROVIDE_BUFFERS"; - case IORING_OP_REMOVE_BUFFERS: - return "REMOVE_BUFFERS"; - case IORING_OP_TEE: - return "TEE"; - case IORING_OP_SHUTDOWN: - return "SHUTDOWN"; - case IORING_OP_RENAMEAT: - return "RENAMEAT"; - case IORING_OP_UNLINKAT: - return "UNLINKAT"; - case IORING_OP_MKDIRAT: - return "MKDIRAT"; - case IORING_OP_SYMLINKAT: - return "SYMLINKAT"; - case IORING_OP_LINKAT: - return "LINKAT"; - case IORING_OP_MSG_RING: - return "MSG_RING"; - case IORING_OP_FSETXATTR: - return "FSETXATTR"; - case IORING_OP_SETXATTR: - return "SETXATTR"; - case IORING_OP_FGETXATTR: - return "FGETXATTR"; - case IORING_OP_GETXATTR: - return "GETXATTR"; - case 
IORING_OP_SOCKET: - return "SOCKET"; - case IORING_OP_URING_CMD: - return "URING_CMD"; - case IORING_OP_LAST: - return "INVALID"; - } - return "INVALID"; -} - -struct sock *io_uring_get_socket(struct file *file) -{ -#if defined(CONFIG_UNIX) - if (file->f_op == &io_uring_fops) { - struct io_ring_ctx *ctx = file->private_data; - - return ctx->ring_sock->sk; - } -#endif - return NULL; -} -EXPORT_SYMBOL(io_uring_get_socket); - -#if defined(CONFIG_UNIX) -static inline bool io_file_need_scm(struct file *filp) -{ -#if defined(IO_URING_SCM_ALL) - return true; -#else - return !!unix_get_socket(filp); -#endif -} -#else -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} -#endif - -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - lockdep_assert_held(&ctx->uring_lock); -} - -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) -{ - if (!*locked) { - mutex_lock(&ctx->uring_lock); - *locked = true; - } -} - -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. - */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_REFCOUNT))) - return true; - - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) -{ - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) - __io_submit_flush_completions(ctx); -} - -static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) -{ - if (!(req->flags & REQ_F_REFCOUNT)) { - req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, nr); - } -} - -static inline void io_req_set_refcount(struct io_kiocb *req) -{ - __io_req_set_refcount(req, 1); -} - -#define IO_RSRC_REF_BATCH 100 - -static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - percpu_ref_put_many(&node->refs, nr); -} - -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } -} - -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); -} - -static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - 
__must_hold(&ctx->uring_lock) -{ - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} - -static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} - -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) -{ - if (!req->rsrc_node) { - req->rsrc_node = ctx->rsrc_node; - - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); - } else { - percpu_ref_get(&req->rsrc_node->refs); - } - } -} - -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) - req->buf_list->head++; - req->flags &= ~REQ_F_BUFFER_RING; - } else { - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } - - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); -} - -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); -} - -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) -{ - unsigned int cflags; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; -} - -static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; - - return xa_load(&ctx->io_bl_xa, bgid); -} - -static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - struct io_buffer *buf; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* don't recycle if we already did IO to this buffer */ - if (req->flags & REQ_F_PARTIAL_IO) - return; - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. 
- */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - return; - } - - io_ring_submit_lock(ctx, issue_flags); - - buf = req->kbuf; - bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); -} - -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *req; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. - */ -static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) -{ - bool matched; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; -} - -static inline bool req_has_async_data(struct io_kiocb *req) -{ - return req->flags & REQ_F_ASYNC_DATA; -} - -static inline void req_set_fail(struct io_kiocb *req) -{ - req->flags |= REQ_F_FAIL; - if (req->flags & REQ_F_CQE_SKIP) { - req->flags &= ~REQ_F_CQE_SKIP; - req->flags |= REQ_F_SKIP_LINK_CQES; - } -} - -static inline void req_fail_link_node(struct io_kiocb *req, int res) -{ - req_set_fail(req); - req->cqe.res = res; -} - -static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); -} - -static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) -{ - struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); - - complete(&ctx->ref_comp); -} - -static inline bool io_is_timeout_noseq(struct io_kiocb *req) -{ - return !req->timeout.off; -} - -static __cold void io_fallback_req_func(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - bool locked = false; - - percpu_ref_get(&ctx->refs); - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } - percpu_ref_put(&ctx->refs); -} - -static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) -{ - struct io_ring_ctx *ctx; - int hash_bits; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return NULL; - - xa_init(&ctx->io_bl_xa); - - /* - * Use 5 bits less than the max cq entries, that should give us around - * 32 entries per hash list if totally full and uniformly spread. 
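For a feel for the numbers in that sizing comment: cq_entries = 4096 gives ilog2 = 12, minus 5 leaves 7 hash bits, i.e. 128 buckets and roughly 32 entries per bucket when completely full. A portable sketch of the same computation (the kernel's ilog2() replaced by a loop; illustrative code only):

#include <assert.h>

/* Mirrors the sizing logic: one bucket per ~32 CQ entries, with a
 * clamped minimum of two buckets. Not the kernel's code. */
static int cancel_hash_bits(unsigned cq_entries)
{
	int bits = -1;

	while (cq_entries) {	/* floor(log2(cq_entries)) */
		cq_entries >>= 1;
		bits++;
	}
	bits -= 5;
	return bits <= 0 ? 1 : bits;
}

int main(void)
{
	assert(cancel_hash_bits(4096) == 7);	/* 128 buckets, ~32 each */
	assert(cancel_hash_bits(2) == 1);	/* clamped minimum */
	return 0;
}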
- */ - hash_bits = ilog2(p->cq_entries); - hash_bits -= 5; - if (hash_bits <= 0) - hash_bits = 1; - ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), - GFP_KERNEL); - if (!ctx->cancel_hash) - goto err; - __hash_init(ctx->cancel_hash, 1U << hash_bits); - - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); - if (!ctx->dummy_ubuf) - goto err; - /* set invalid range, so io_import_fixed() fails meeting it */ - ctx->dummy_ubuf->ubuf = -1UL; - - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) - goto err; - - ctx->flags = p->flags; - init_waitqueue_head(&ctx->sqo_sq_wait); - INIT_LIST_HEAD(&ctx->sqd_list); - INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_LIST_HEAD(&ctx->apoll_cache); - init_completion(&ctx->ref_comp); - xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); - mutex_init(&ctx->uring_lock); - init_waitqueue_head(&ctx->cq_wait); - spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); - INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_pages); - INIT_LIST_HEAD(&ctx->io_buffers_comp); - INIT_LIST_HEAD(&ctx->defer_list); - INIT_LIST_HEAD(&ctx->timeout_list); - INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); - INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_llist_head(&ctx->rsrc_put_llist); - INIT_LIST_HEAD(&ctx->tctx_list); - ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); - INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); - INIT_WQ_LIST(&ctx->submit_state.compl_reqs); - return ctx; -err: - kfree(ctx->dummy_ubuf); - kfree(ctx->cancel_hash); - kfree(ctx->io_bl); - xa_destroy(&ctx->io_bl_xa); - kfree(ctx); - return NULL; -} - -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - -static inline void io_req_track_inflight(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_INFLIGHT)) { - req->flags |= REQ_F_INFLIGHT; - atomic_inc(&current->io_uring->inflight_tracked); - } -} - -static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) -{ - if (WARN_ON_ONCE(!req->link)) - return NULL; - - req->flags &= ~REQ_F_ARM_LTIMEOUT; - req->flags |= REQ_F_LINK_TIMEOUT; - - /* linked timeouts should have two refs once prep'ed */ - io_req_set_refcount(req); - __io_req_set_refcount(req->link, 2); - return req->link; -} - -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - -static void io_prep_async_work(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx 
*ctx = req->ctx; - - if (!(req->flags & REQ_F_CREDS)) { - req->flags |= REQ_F_CREDS; - req->creds = get_current_cred(); - } - - req->work.list.next = NULL; - req->work.flags = 0; - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; - - if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) - io_wq_hash_work(&req->work, file_inode(req->file)); - } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { - if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; - } -} - -static void io_prep_async_link(struct io_kiocb *req) -{ - struct io_kiocb *cur; - - if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - io_for_each_link(cur, req) - io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); - } else { - io_for_each_link(cur, req) - io_prep_async_work(cur); - } -} - -static inline void io_req_add_compl_list(struct io_kiocb *req) -{ - struct io_submit_state *state = &req->ctx->submit_state; - - if (!(req->flags & REQ_F_CQE_SKIP)) - state->flush_cqes = true; - wq_list_add_tail(&req->comp_list, &state->compl_reqs); -} - -static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) -{ - struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; - - BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); - - /* init ->work of the whole link before punting */ - io_prep_async_link(req); - - /* - * Not expected to happen, but if we do have a bug where this _can_ - * happen, catch it here and ensure the request is marked as - * canceled. That will make io-wq go through the usual work cancel - * procedure rather than attempt to run this request (or create a new - * worker for it). - */ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; - - trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, - req->opcode, req->flags, &req->work, - io_wq_is_hashed(&req->work)); - io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); -} - -static void io_kill_timeout(struct io_kiocb *req, int status) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_timeout_data *io = req->async_data; - - if (hrtimer_try_to_cancel(&io->timer) != -1) { - if (status) - req_set_fail(req); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&req->timeout.list); - io_req_tw_post_queue(req, status, 0); - } -} - -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) -{ - while (!list_empty(&ctx->defer_list)) { - struct io_defer_entry *de = list_first_entry(&ctx->defer_list, - struct io_defer_entry, list); - - if (req_need_defer(de->req, de->seq)) - break; - list_del_init(&de->list); - io_req_task_queue(de->req); - kfree(de); - } -} - -static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) - __must_hold(&ctx->completion_lock) -{ - u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_kiocb *req, *tmp; - - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - u32 events_needed, events_got; - - if (io_is_timeout_noseq(req)) - break; - - /* - * Since seq can easily wrap around over time, subtract - * the last seq at which timeouts were flushed before comparing. 
- * Assuming not more than 2^31-1 events have happened since, - * these subtractions won't have wrapped, so we can check if - * target is in [last_seq, current_seq] by comparing the two. - */ - events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; - events_got = seq - ctx->cq_last_tm_flush; - if (events_got < events_needed) - break; - - io_kill_timeout(req, 0); - } - ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); -} - -static inline void io_commit_cqring(struct io_ring_ctx *ctx) -{ - /* order cqe stores with ring update */ - smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); -} - -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ - if (ctx->off_timeout_used || ctx->drain_active) { - spin_lock(&ctx->completion_lock); - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - } - if (ctx->has_evfd) - io_eventfd_signal(ctx); -} - -static inline bool io_sqring_full(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; -} - -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) -{ - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - if (ctx->flags & IORING_SETUP_CQE32) { - unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - return cqe; - } - - return __io_get_cqe(ctx); -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists in case an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. 
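The events_got/events_needed test in io_flush_timeouts() above is the classic wrap-safe sequence comparison: distances from a common base are compared rather than the raw u32 values. A standalone illustration (seq_reached() is a hypothetical name, not a kernel helper):

#include <assert.h>

/* True if target lies within [base, cur], even across u32 wrap-around. */
static int seq_reached(unsigned base, unsigned target, unsigned cur)
{
	return cur - base >= target - base;
}

int main(void)
{
	/* near the wrap point: target 5 is "after" base 0xfffffff0 */
	assert(seq_reached(0xfffffff0u, 5u, 10u));
	assert(!seq_reached(0xfffffff0u, 5u, 0xfffffffeu));
	return 0;
}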
- */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - - if (!ev_fd->eventfd_async || io_wq_current_is_worker()) - eventfd_signal(ev_fd->cq_ev_fd, 1); -out: - rcu_read_unlock(); -} - -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. - */ - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); -} - -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no dependency on a - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - io_cqring_wake(ctx); -} - -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - -/* Returns true if there are no backlogged entries after the flush */ -static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) -{ - bool all_flushed, posted; - size_t cqe_size = sizeof(struct io_uring_cqe); - - if (!force && __io_cqring_events(ctx) == ctx->cq_entries) - return false; - - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - - posted = false; - spin_lock(&ctx->completion_lock); - while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqe(ctx); - struct io_overflow_cqe *ocqe; - - if (!cqe && !force) - break; - ocqe = list_first_entry(&ctx->cq_overflow_list, - struct io_overflow_cqe, list); - if (cqe) - memcpy(cqe, &ocqe->cqe, cqe_size); - else - io_account_cq_overflow(ctx); - - posted = true; - list_del(&ocqe->list); - kfree(ocqe); - } - - all_flushed = list_empty(&ctx->cq_overflow_list); - if (all_flushed) { - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); - } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); - return all_flushed; -} - -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) -{ - bool ret = true; - - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, false); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - return ret; -} - -static void __io_put_task(struct task_struct *task, int nr) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); -} - -/* must be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - -static void io_task_refs_refill(struct io_uring_task *tctx) -{ - unsigned int 
refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, &current->usage); - tctx->cached_refs += refill; -} - -static inline void io_get_task_refs(int nr) -{ - struct io_uring_task *tctx = current->io_uring; - - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) - io_task_refs_refill(tctx); -} - -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - unsigned int refs = tctx->cached_refs; - - if (refs) { - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); - } -} - -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, - u64 extra2) -{ - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); - if (!ocqe) { - /* - * If we're in ring overflow flush mode, or in task cancel mode, - * or cannot allocate an overflow entry, then we need to drop it - * on the floor. - */ - io_account_cq_overflow(ctx); - set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); - return false; - } - if (list_empty(&ctx->cq_overflow_list)) { - set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); - - } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } - list_add_tail(&ocqe->list, &ctx->cq_overflow_list); - return true; -} - -static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) -{ - struct io_uring_cqe *cqe; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, user_data); - WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, cflags); - return true; - } - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); -} - -static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); -} - -static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - u64 extra1 = req->extra1; - u64 extra2 = req->extra2; - - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring.
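The fill-or-overflow pattern above can be modelled outside the kernel with a bounded ring plus a side list. A minimal sketch with hypothetical names (ring_post(), struct onode), not the kernel's types:

	#include <stdbool.h>
	#include <stdlib.h>

	struct cqe { unsigned long long user_data; int res; unsigned flags; };
	struct onode { struct cqe cqe; struct onode *next; };
	struct ring {
		struct cqe *slots;
		unsigned head, tail, mask;	/* free-running counters */
		struct onode *overflow;		/* stash when the ring is full */
		bool overflow_flagged;		/* like IORING_SQ_CQ_OVERFLOW */
	};

	static bool ring_post(struct ring *r, const struct cqe *c)
	{
		if (r->tail - r->head <= r->mask) {	/* ring has space */
			r->slots[r->tail++ & r->mask] = *c;
			return true;
		}
		struct onode *n = malloc(sizeof(*n));	/* GFP_ATOMIC analogue */
		if (!n)
			return false;	/* dropped, like io_account_cq_overflow() */
		n->cqe = *c;
		n->next = r->overflow;	/* the kernel keeps FIFO order instead */
		r->overflow = n;
		r->overflow_flagged = true;
		return true;
	}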
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - cqe->big_cqe[0] = extra1; - cqe->big_cqe[1] = extra2; - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, - req->cqe.flags, extra1, extra2); -} - -static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) -{ - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); - return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); -} - -static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, - u64 extra1, u64 extra2) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_cqe *cqe; - - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) - return; - if (req->flags & REQ_F_CQE_SKIP) - return; - - trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, - extra1, extra2); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, req->cqe.user_data); - WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, cflags); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return; - } - - io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); -} - -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) -{ - ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - return __io_fill_cqe(ctx, user_data, res, cflags); -} - -static void __io_req_complete_put(struct io_kiocb *req) -{ - /* - * If we're the last reference to this request, add to our locked - * free_list cache. - */ - if (req_ref_put_and_test(req)) { - struct io_ring_ctx *ctx = req->ctx; - - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_req_put_rsrc(req); - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. 
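For orientation, the CQE32 variants above append two extra 64-bit values to the normal completion. A rough model of the 32-byte slot layout (the authoritative definition is struct io_uring_cqe in include/uapi/linux/io_uring.h):

	#include <stdint.h>

	struct cqe32 {
		uint64_t user_data;	/* sqe->user_data, echoed back */
		int32_t  res;		/* result code */
		uint32_t flags;		/* IORING_CQE_F_* */
		uint64_t big_cqe[2];	/* extra1/extra2, IORING_SETUP_CQE32 only */
	};	/* twice the 16-byte base CQE, i.e. 32 bytes per slot */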
- */ - io_put_kbuf_comp(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - } -} - -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); - __io_req_complete_put(req); -} - -static void __io_req_complete_post32(struct io_kiocb *req, s32 res, - u32 cflags, u64 extra1, u64 extra2) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe32_req(req, res, cflags, extra1, extra2); - __io_req_complete_put(req); -} - -static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_req_complete_post(req, res, cflags); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void io_req_complete_post32(struct io_kiocb *req, s32 res, - u32 cflags, u64 extra1, u64 extra2) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_req_complete_post32(req, res, cflags, extra1, extra2); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static inline void io_req_complete_state(struct io_kiocb *req, s32 res, - u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->flags |= REQ_F_COMPLETE_INLINE; -} - -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - s32 res, u32 cflags) -{ - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - io_req_complete_state(req, res, cflags); - else - io_req_complete_post(req, res, cflags); -} - -static inline void __io_req_complete32(struct io_kiocb *req, - unsigned int issue_flags, s32 res, - u32 cflags, u64 extra1, u64 extra2) -{ - if (issue_flags & IO_URING_F_COMPLETE_DEFER) { - io_req_complete_state(req, res, cflags); - req->extra1 = extra1; - req->extra2 = extra2; - } else { - io_req_complete_post32(req, res, cflags, extra1, extra2); - } -} - -static inline void io_req_complete(struct io_kiocb *req, s32 res) -{ - if (res < 0) - req_set_fail(req); - __io_req_complete(req, 0, res, 0); -} - -static void io_req_complete_failed(struct io_kiocb *req, s32 res) -{ - req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); -} - -/* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->cqe.res = 0; -} - -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - -static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) -{ - return !ctx->submit_state.free_list.next; -} - -/* - * A request might get retired back into the request caches even before opcode - * handlers and io_issue_sqe() are done with it, e.g. inline completion path. - * Because of that, io_alloc_req() should be called only under ->uring_lock - * and with extra caution to not get a request that is still worked on. 
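The comment above describes a two-level free cache. As a rough userspace analogy (hypothetical names, pthreads instead of spinlocks): completions push onto a shared locked list, and the allocator splices that list over in one step before handing out entries with no further locking:

	#include <pthread.h>
	#include <stddef.h>

	struct node { struct node *next; };
	struct cache {
		pthread_mutex_t lock;
		struct node *locked_free;	/* filled by completion side */
		struct node *free_list;		/* private to the allocator */
	};

	static struct node *cache_alloc(struct cache *c)
	{
		if (!c->free_list) {		/* batch refill: one lock trip */
			pthread_mutex_lock(&c->lock);
			c->free_list = c->locked_free;
			c->locked_free = NULL;
			pthread_mutex_unlock(&c->lock);
		}
		struct node *n = c->free_list;
		if (n)
			c->free_list = n->next;
		return n;	/* NULL: fall back to the real allocator */
	}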
- */ -static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } - - ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); - - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - if (unlikely(ret <= 0)) { - reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!reqs[0]) - return false; - ret = 1; - } - - percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; - - io_preinit_req(req, ctx); - io_req_add_to_cache(req, ctx); - } - return true; -} - -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) -{ - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; -} - -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) -{ - struct io_wq_work_node *node; - - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); -} - -static inline void io_put_file(struct file *file) -{ - if (file) - fput(file); -} - -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - -static __cold void io_free_req(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - io_req_put_rsrc(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); -} - -static inline void io_remove_next_linked(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - req->link = nxt->link; - nxt->link = NULL; -} - -static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - struct io_timeout_data *io = link->async_data; - - io_remove_next_linked(req); - link->timeout.head = NULL; - if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&link->timeout.list); - return link; - } - } - return NULL; -} - -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; - while (link) { - long res = -ECANCELED; - - if (link->flags & REQ_F_FAIL) - res = link->cqe.res; - - nxt = link->link; - link->link = NULL; - - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); - - if (ignore_cqes) - link->flags |= REQ_F_CQE_SKIP; - else - link->flags &= ~REQ_F_CQE_SKIP; - __io_req_complete_post(link, res, 0); - link = nxt; - } -} - -static bool io_disarm_next(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *link = NULL; - bool posted = false; - - if (req->flags & REQ_F_ARM_LTIMEOUT) { - link = req->link; - req->flags &= 
~REQ_F_ARM_LTIMEOUT; - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); - posted = true; - } - } else if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); - if (link) { - posted = true; - io_req_tw_post_queue(link, -ECANCELED, 0); - } - } - if (unlikely((req->flags & REQ_F_FAIL) && - !(req->flags & REQ_F_HARDLINK))) { - posted |= (req->link != NULL); - io_fail_links(req); - } - return posted; -} - -static void __io_req_find_next_prep(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - bool posted; - - spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); -} - -static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt; - - /* - * If LINK is set, we have dependent requests in this chain. If we - * didn't fail this request, queue the first one up, moving any other - * dependencies to the next request. In case of failure, fail the rest - * of the chain. - */ - if (unlikely(req->flags & IO_DISARM_MASK)) - __io_req_find_next_prep(req); - nxt = req->link; - req->link = NULL; - return nxt; -} - -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) -{ - if (!ctx) - return; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - *locked = false; - } - percpu_ref_put(&ctx->refs); -} - -static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) -{ - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void handle_prev_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *uring_locked) -{ - if (*ctx && !*uring_locked) - spin_lock(&(*ctx)->completion_lock); - - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - if (unlikely(!*uring_locked && *ctx)) - ctx_commit_and_unlock(*ctx); - - ctx_flush_and_put(*ctx, uring_locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *uring_locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - if (unlikely(!*uring_locked)) - spin_lock(&(*ctx)->completion_lock); - } - if (likely(*uring_locked)) - req->io_task_work.func(req, uring_locked); - else - __io_req_complete_post(req, req->cqe.res, - io_put_kbuf_comp(req)); - node = next; - } while (node); - - if (unlikely(!*uring_locked)) - ctx_commit_and_unlock(*ctx); -} - -static void handle_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *locked) -{ - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - } - req->io_task_work.func(req, locked); - node = next; - } while (node); -} - -static void 
tctx_task_work(struct callback_head *cb) -{ - bool uring_locked = false; - struct io_ring_ctx *ctx = NULL; - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, - task_work); - - while (1) { - struct io_wq_work_node *node1, *node2; - - spin_lock_irq(&tctx->task_lock); - node1 = tctx->prio_task_list.first; - node2 = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - if (!node2 && !node1) - tctx->task_running = false; - spin_unlock_irq(&tctx->task_lock); - if (!node2 && !node1) - break; - - if (node1) - handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) - handle_tw_list(node2, &ctx, &uring_locked); - cond_resched(); - - if (data_race(!tctx->task_list.first) && - data_race(!tctx->prio_task_list.first) && uring_locked) - io_submit_flush_completions(ctx); - } - - ctx_flush_and_put(ctx, &uring_locked); - - /* relaxed read is enough as only the task itself sets ->in_idle */ - if (unlikely(atomic_read(&tctx->in_idle))) - io_uring_drop_tctx_refs(current); -} - -static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx, - struct io_wq_work_list *list) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_wq_work_node *node; - unsigned long flags; - bool running; - - spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, list); - running = tctx->task_running; - if (!running) - tctx->task_running = true; - spin_unlock_irqrestore(&tctx->task_lock, flags); - - /* task_work already pending, we're done */ - if (running) - return; - - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) - return; - - spin_lock_irqsave(&tctx->task_lock, flags); - tctx->task_running = false; - node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); - spin_unlock_irqrestore(&tctx->task_lock, flags); - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.fallback_node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } -} - -static void io_req_task_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -static void io_req_task_prio_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - if (req->ctx->flags & IORING_SETUP_SQPOLL) - __io_req_task_work_add(req, tctx, &tctx->prio_task_list); - else - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -static void io_req_tw_post(struct io_kiocb *req, bool *locked) -{ - io_req_complete_post(req, req->cqe.res, req->cqe.flags); -} - -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->io_task_work.func = io_req_tw_post; - io_req_task_work_add(req); -} - -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) -{ - /* not needed for normal modes, but SQPOLL depends on it */ - io_tw_lock(req->ctx, locked); - io_req_complete_failed(req, req->cqe.res); -} - -static void io_req_task_submit(struct io_kiocb *req, bool *locked) -{ - io_tw_lock(req->ctx, locked); - /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else - io_req_complete_failed(req, -EFAULT); -} - -static void io_req_task_queue_fail(struct 
io_kiocb *req, int ret) -{ - req->cqe.res = ret; - req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req); -} - -static void io_req_task_queue(struct io_kiocb *req) -{ - req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req); -} - -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - -static void io_queue_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = io_req_find_next(req); - - if (nxt) - io_req_task_queue(nxt); -} - -static void io_free_batch_list(struct io_ring_ctx *ctx, - struct io_wq_work_node *node) - __must_hold(&ctx->uring_lock) -{ - struct task_struct *task = NULL; - int task_refs = 0; - - do { - struct io_kiocb *req = container_of(node, struct io_kiocb, - comp_list); - - if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { - if (req->flags & REQ_F_REFCOUNT) { - node = req->comp_list.next; - if (!req_ref_put_and_test(req)) - continue; - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - struct async_poll *apoll = req->apoll; - - if (apoll->double_poll) - kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); - req->flags &= ~REQ_F_POLLED; - } - if (req->flags & IO_REQ_LINK_FLAGS) - io_queue_next(req); - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - } - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); - - io_req_put_rsrc_locked(req, ctx); - - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; - node = req->comp_list.next; - io_req_add_to_cache(req, ctx); - } while (node); - - if (task) - io_put_task(task, task_refs); -} - -static void __io_submit_flush_completions(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_wq_work_node *node, *prev; - struct io_submit_state *state = &ctx->submit_state; - - if (state->flush_cqes) { - spin_lock(&ctx->completion_lock); - wq_list_for_each(node, prev, &state->compl_reqs) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - comp_list); - - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!(ctx->flags & IORING_SETUP_CQE32)) - __io_fill_cqe_req_filled(ctx, req); - else - __io_fill_cqe32_req_filled(ctx, req); - } - } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - state->flush_cqes = false; - } - - io_free_batch_list(ctx, state->compl_reqs.first); - INIT_WQ_LIST(&state->compl_reqs); -} - -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. 
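io_free_batch_list() above coalesces per-task reference drops over runs of requests with the same owner, instead of dropping one reference per request. The shape of that optimization, with hypothetical types and an assumed put_refs() helper:

	struct task;				/* opaque owner */
	struct req { struct task *owner; struct req *next; };

	extern void put_refs(struct task *t, int nr);	/* assumed helper */

	static void free_batch(struct req *head)
	{
		struct task *owner = NULL;
		int refs = 0;

		for (struct req *r = head; r; r = r->next) {
			if (r->owner != owner) {
				if (owner)
					put_refs(owner, refs);	/* one drop per run */
				owner = r->owner;
				refs = 0;
			}
			refs++;
		}
		if (owner)
			put_refs(owner, refs);
	}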
- */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; - - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; -} - -static inline void io_put_req(struct io_kiocb *req) -{ - if (req_ref_put_and_test(req)) { - io_queue_next(req); - io_free_req(req); - } -} - -static unsigned io_cqring_events(struct io_ring_ctx *ctx) -{ - /* See comment at the top of this file */ - smp_rmb(); - return __io_cqring_events(ctx); -} - -static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; -} - -static inline bool io_run_task_work(void) -{ - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - - return false; -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) -{ - struct io_wq_work_node *pos, *start, *prev; - unsigned int poll_flags = BLK_POLL_NOSLEEP; - DEFINE_IO_COMP_BATCH(iob); - int nr_events = 0; - - /* - * Only spin for completions if we don't have multiple devices hanging - * off our complete list. - */ - if (ctx->poll_multi_queue || force_nonspin) - poll_flags |= BLK_POLL_ONESHOT; - - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct kiocb *kiocb = &req->rw.kiocb; - int ret; - - /* - * Move completed and retryable entries to our local lists. - * If we find a request that requires polling, break out - * and complete those lists first, if we have entries there. - */ - if (READ_ONCE(req->iopoll_completed)) - break; - - ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); - if (unlikely(ret < 0)) - return ret; - else if (ret) - poll_flags |= BLK_POLL_ONESHOT; - - /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || - READ_ONCE(req->iopoll_completed)) - break; - } - - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - - /* order with io_complete_rw_iopoll(), e.g. ->result updates */ - if (!smp_load_acquire(&req->iopoll_completed)) - break; - nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); - } - - if (unlikely(!nr_events)) - return 0; - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - pos = start ? start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); - return nr_events; -} - -/* - * We can't just wait for polled events to come to us, we have to actively - * find and complete them. - */ -static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_IOPOLL)) - return; - - mutex_lock(&ctx->uring_lock); - while (!wq_list_empty(&ctx->iopoll_list)) { - /* let it sleep and repeat later if can't complete a request */ - if (io_do_iopoll(ctx, true) == 0) - break; - /* - * Ensure we allow local-to-the-cpu processing to take place, - * in this case we need to ensure that we reap all events. - * Also let task_work, etc. 
progress by releasing the mutex - */ - if (need_resched()) { - mutex_unlock(&ctx->uring_lock); - cond_resched(); - mutex_lock(&ctx->uring_lock); - } - } - mutex_unlock(&ctx->uring_lock); -} - -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) -{ - unsigned int nr_events = 0; - int ret = 0; - unsigned long check_cq; - - /* - * Don't enter poll loop if we already have events pending. - * If we do, we can potentially be spinning for commands that - * already triggered a CQE (eg in error). - */ - check_cq = READ_ONCE(ctx->check_cq); - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx, false); - if (io_cqring_events(ctx)) - return 0; - - /* - * Similarly do not spin if we have not informed the user of any - * dropped CQE. - */ - if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) - return -EBADR; - - do { - /* - * If a submit got punted to a workqueue, we can have the - * application entering polling for a command before it gets - * issued. That app will hold the uring_lock for the duration - * of the poll right here, so we need to take a breather every - * now and then to ensure that the issue has a chance to add - * the poll to the issued list. Otherwise we can spin here - * forever, while the workqueue is stuck trying to acquire the - * very same mutex. - */ - if (wq_list_empty(&ctx->iopoll_list)) { - u32 tail = ctx->cached_cq_tail; - - mutex_unlock(&ctx->uring_lock); - io_run_task_work(); - mutex_lock(&ctx->uring_lock); - - /* some requests don't go through iopoll_list */ - if (tail != ctx->cached_cq_tail || - wq_list_empty(&ctx->iopoll_list)) - break; - } - ret = io_do_iopoll(ctx, !min); - if (ret < 0) - break; - nr_events += ret; - ret = 0; - } while (nr_events < min && !need_resched()); - - return ret; -} - -static void kiocb_end_write(struct io_kiocb *req) -{ - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; - - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); - } -} - -#ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *rw = req->async_data; - - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&rw->s.iter, &rw->s.iter_state); - return true; -} - -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - umode_t mode = file_inode(req->file)->i_mode; - struct io_ring_ctx *ctx = req->ctx; - - if (!S_ISBLK(mode) && !S_ISREG(mode)) - return false; - if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) - return false; - /* - * If ref is dying, we might be running poll reap from the exit work. - * Don't attempt to reissue from that path, just let it fail with - * -EAGAIN. - */ - if (percpu_ref_is_dying(&ctx->refs)) - return false; - /* - * Play it safe and assume it's not safe to re-import and reissue if we're - * not in the original thread group (or in task context).
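The "breather" that io_iopoll_check() above describes (drop the lock, let task_work and blocked submitters in, retake it) generalizes to any polling loop that holds a lock others need. A pthreads sketch of the idea:

	#include <pthread.h>
	#include <sched.h>
	#include <stdbool.h>

	static void poll_with_breather(pthread_mutex_t *lock,
				       bool (*more_work)(void),
				       void (*poll_once)(void))
	{
		pthread_mutex_lock(lock);
		while (more_work()) {
			poll_once();
			/* step aside so blocked submitters can queue work */
			pthread_mutex_unlock(lock);
			sched_yield();		/* cond_resched() analogue */
			pthread_mutex_lock(lock);
		}
		pthread_mutex_unlock(lock);
	}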
- */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; - return true; -} -#else -static bool io_resubmit_prep(struct io_kiocb *req) -{ - return false; -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - return false; -} -#endif - -static bool __io_complete_rw_common(struct io_kiocb *req, long res) -{ - if (req->rw.kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); - fsnotify_modify(req->file); - } else { - fsnotify_access(req->file); - } - if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; - return true; - } - req_set_fail(req); - req->cqe.res = res; - } - return false; -} - -static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) -{ - int res = req->cqe.res; - - if (*locked) { - io_req_complete_state(req, res, io_put_kbuf(req, 0)); - io_req_add_compl_list(req); - } else { - io_req_complete_post(req, res, - io_put_kbuf(req, IO_URING_F_UNLOCKED)); - } -} - -static void __io_complete_rw(struct io_kiocb *req, long res, - unsigned int issue_flags) -{ - if (__io_complete_rw_common(req, res)) - return; - __io_req_complete(req, issue_flags, req->cqe.res, - io_put_kbuf(req, issue_flags)); -} - -static void io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - - if (__io_complete_rw_common(req, res)) - return; - req->cqe.res = res; - req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); -} - -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - - if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE; - return; - } - req->cqe.res = res; - } - - /* order with io_iopoll_complete() checking ->iopoll_completed */ - smp_store_release(&req->iopoll_completed, 1); -} - -/* - * After the iocb has been issued, it's safe to be found on the poll list. - * Adding the kiocb to the list AFTER submission ensures that we don't - * find it from an io_do_iopoll() thread before the issuer is done - * accessing the kiocb cookie. - */ -static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - - /* workqueue context doesn't hold uring_lock, grab it now */ - if (unlikely(needs_lock)) - mutex_lock(&ctx->uring_lock); - - /* - * Track whether we have multiple files in our lists. This will impact - * how we do polling eventually, not spinning if we're on potentially - * different devices. - */ - if (wq_list_empty(&ctx->iopoll_list)) { - ctx->poll_multi_queue = false; - } else if (!ctx->poll_multi_queue) { - struct io_kiocb *list_req; - - list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, - comp_list); - if (list_req->file != req->file) - ctx->poll_multi_queue = true; - } - - /* - * For fast devices, IO may have already completed. If it has, add - * it to the front so we find it first. - */ - if (READ_ONCE(req->iopoll_completed)) - wq_list_add_head(&req->comp_list, &ctx->iopoll_list); - else - wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); - - if (unlikely(needs_lock)) { - /* - * If IORING_SETUP_SQPOLL is enabled, sqes are either handled - * in sq thread task context or in io worker task context.
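io_iopoll_req_issued() above front-loads already-completed requests so that the reaping pass, which stops at the first incomplete entry, picks them up immediately. Schematically, with a hypothetical singly linked list:

	#include <stdbool.h>
	#include <stddef.h>

	struct pnode { bool completed; struct pnode *next; };
	struct plist { struct pnode *head, *tail; };

	static void poll_list_add(struct plist *l, struct pnode *n)
	{
		if (n->completed) {		/* fast device: reap it first */
			n->next = l->head;
			l->head = n;
			if (!l->tail)
				l->tail = n;
		} else {			/* still in flight: poll later */
			n->next = NULL;
			if (l->tail)
				l->tail->next = n;
			else
				l->head = n;
			l->tail = n;
		}
	}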
If - * current task context is sq thread, we don't need to check - * whether we should wake up the sq thread. - */ - if ((ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - - mutex_unlock(&ctx->uring_lock); - } -} - -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) -{ - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - file->f_op != &io_uring_fops) - return true; - return false; - } - - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static unsigned int io_file_get_flags(struct file *file) -{ - umode_t mode = file_inode(file)->i_mode; - unsigned int res = 0; - - if (S_ISREG(mode)) - res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) - res |= FFS_NOWAIT; - if (io_file_need_scm(file)) - res |= FFS_SCM; - return res; -} - -static inline bool io_file_supports_nowait(struct io_kiocb *req) -{ - return req->flags & REQ_F_SUPPORT_NOWAIT; -} - -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct kiocb *kiocb = &req->rw.kiocb; - unsigned ioprio; - int ret; - - kiocb->ki_pos = READ_ONCE(sqe->off); - - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - kiocb->ki_ioprio = ioprio; - } else { - kiocb->ki_ioprio = get_current_ioprio(); - } - - req->imu = NULL; - req->rw.addr = READ_ONCE(sqe->addr); - req->rw.len = READ_ONCE(sqe->len); - req->rw.flags = READ_ONCE(sqe->rw_flags); - /* used for fixed read/write too - just read unconditionally */ - req->buf_index = READ_ONCE(sqe->buf_index); - return 0; -} - -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR.
- */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - -static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) -{ - struct kiocb *kiocb = &req->rw.kiocb; - - if (kiocb->ki_pos != -1) - return &kiocb->ki_pos; - - if (!(req->file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; - } - - kiocb->ki_pos = 0; - return NULL; -} - -static void kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) -{ - struct io_async_rw *io = req->async_data; - - /* add previously done IO, if any */ - if (req_has_async_data(req) && io->bytes_done > 0) { - if (ret < 0) - ret = io->bytes_done; - else - ret += io->bytes_done; - } - - if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = req->rw.kiocb.ki_pos; - if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, issue_flags); - else - io_rw_done(&req->rw.kiocb, ret); - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, ret); - } -} - -static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - struct io_mapped_ubuf *imu) -{ - size_t len = req->rw.len; - u64 buf_end, buf_addr = req->rw.addr; - size_t offset; - - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec), just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned.
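Worked numbers for the index math above, assuming 4 KiB pages (PAGE_SHIFT == 12), a short first bvec of 1 KiB, and a target offset 10 KiB into the buffer:

	#include <assert.h>

	int main(void)
	{
		unsigned long offset = 10240;	/* byte offset into the buffer */
		unsigned long first_len = 1024;	/* short first bvec */

		offset -= first_len;		/* 9216 bytes left to skip */
		unsigned long seg_skip = 1 + (offset >> 12);	/* bvec 0 + 2 full pages = 3 */
		unsigned long iov_off = offset & 4095;		/* 1024 bytes into bvec 3 */

		assert(seg_skip == 3);
		assert(iov_off == 1024);
		return 0;
	}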
- */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - struct io_mapped_ubuf *imu = req->imu; - u16 index, buf_index = req->buf_index; - - if (likely(!imu)) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - io_req_set_rsrc_node(req, ctx, issue_flags); - index = array_index_nospec(buf_index, ctx->nr_user_bufs); - imu = READ_ONCE(ctx->user_bufs[index]); - req->imu = imu; - } - return __io_import_fixed(req, rw, iter, imu); -} - -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); -} - -static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl) -{ - if (!list_empty(&bl->buf_list)) { - struct io_buffer *kbuf; - - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; - return u64_to_user_ptr(kbuf->addr); - } - return NULL; -} - -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) -{ - struct io_uring_buf_ring *br = bl->buf_ring; - struct io_uring_buf *buf; - __u32 head = bl->head; - - if (unlikely(smp_load_acquire(&br->tail) == head)) { - io_ring_submit_unlock(req->ctx, issue_flags); - return NULL; - } - - head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; - buf = page_address(bl->buf_pages[index]); - buf += off; - } - if (*len > buf->len) - *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; - req->buf_index = buf->bid; - - if (issue_flags & IO_URING_F_UNLOCKED) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). 
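The head/tail handshake used by io_ring_buffer_select() above is a single-producer/single-consumer ring: userspace publishes buffers by advancing the tail with a release store, and the kernel consumes with an acquire load. A generic sketch with C11 atomics (not the kernel's types):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct entry { unsigned long long addr; unsigned len; };
	struct bring {
		struct entry *bufs;
		_Atomic unsigned tail;	/* advanced by the producer */
		unsigned head;		/* consumer-private, like bl->head */
		unsigned mask;
	};

	static bool bring_consume(struct bring *r, struct entry *out)
	{
		/* pairs with the producer's release store of tail */
		if (atomic_load_explicit(&r->tail, memory_order_acquire) == r->head)
			return false;	/* empty: no buffer provided */
		*out = r->bufs[r->head++ & r->mask];
		return true;
	}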
- */ - req->buf_list = NULL; - bl->head++; - } - return u64_to_user_ptr(buf->addr); -} - -static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - void __user *ret = NULL; - - io_ring_submit_lock(req->ctx, issue_flags); - - bl = io_buffer_get_list(ctx, req->buf_index); - if (likely(bl)) { - if (bl->buf_nr_pages) - ret = io_ring_buffer_select(req, len, bl, issue_flags); - else - ret = io_provided_buffer_select(req, len, bl); - } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; -} - -#ifdef CONFIG_COMPAT -static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct compat_iovec __user *uiov; - compat_ssize_t clen; - void __user *buf; - size_t len; - - uiov = u64_to_user_ptr(req->rw.addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - - len = clen; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - req->rw.addr = (unsigned long) buf; - iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = (compat_size_t) len; - return 0; -} -#endif - -static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); - void __user *buf; - ssize_t len; - - if (copy_from_user(iov, uiov, sizeof(*uiov))) - return -EFAULT; - - len = iov[0].iov_len; - if (len < 0) - return -EINVAL; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - req->rw.addr = (unsigned long) buf; - iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = len; - return 0; -} - -static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(req->rw.addr); - iov[0].iov_len = req->rw.len; - return 0; - } - if (req->rw.len != 1) - return -EINVAL; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return io_compat_import(req, iov, issue_flags); -#endif - - return __io_iov_buffer_select(req, iov, issue_flags); -} - -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - -static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter, issue_flags); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(req->rw.addr); - sqe_len = req->rw.len; - - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - req->rw.addr = (unsigned long) buf; - req->rw.len = sqe_len; - } - - ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, iovec, issue_flags); - if (ret) - return ERR_PTR(ret); - iov_iter_init(iter, rw, 
iovec, 1, iovec->iov_len); - return NULL; - } - - ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - -static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) -{ - return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; -} - -/* - * For files that don't have ->read_iter() and ->write_iter(), handle them - * by looping over ->read() or ->write() manually. - */ -static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) -{ - struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; - ssize_t ret = 0; - loff_t *ppos; - - /* - * Don't support polled IO through this interface, and we can't - * support non-blocking either. For the latter, this just causes - * the kiocb to be handled from an async context. - */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EOPNOTSUPP; - if ((kiocb->ki_flags & IOCB_NOWAIT) && - !(kiocb->ki_filp->f_flags & O_NONBLOCK)) - return -EAGAIN; - - ppos = io_kiocb_ppos(kiocb); - - while (iov_iter_count(iter)) { - struct iovec iovec; - ssize_t nr; - - if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); - } else { - iovec.iov_base = u64_to_user_ptr(req->rw.addr); - iovec.iov_len = req->rw.len; - } - - if (rw == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (!iov_iter_is_bvec(iter)) { - iov_iter_advance(iter, nr); - } else { - req->rw.addr += nr; - req->rw.len -= nr; - if (!req->rw.len) - break; - } - if (nr != iovec.iov_len) - break; - } - - return ret; -} - -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *rw = req->async_data; - - memcpy(&rw->s.iter, iter, sizeof(*iter)); - rw->free_iovec = iovec; - rw->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - rw->s.iter.iov = rw->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - rw->s.iter.iov += iov_off; - } - if (rw->s.fast_iov != fast_iov) - memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static inline bool io_alloc_async_data(struct io_kiocb *req) -{ - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_op_defs[req->opcode].needs_async_setup) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, 
ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - iorw->bytes_done = 0; - iorw->free_iovec = iov; - if (iov) - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, READ); -} - -static int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, WRITE); -} - -/* - * This is our waitqueue callback handler, registered through __folio_lock_async() - * when we initially tried to do the IO with the iocb that armed our waitqueue. - * This gets called when the page is unlocked, and we generally expect that to - * happen when the page IO is completed and the page is now uptodate. This will - * queue a task_work based retry of the operation, attempting to copy the data - * again. If the latter fails because the page was NOT uptodate, then we will - * do a thread based blocking retry of the operation. That's the unexpected - * slow path. - */ -static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, - int sync, void *arg) -{ - struct wait_page_queue *wpq; - struct io_kiocb *req = wait->private; - struct wait_page_key *key = arg; - - wpq = container_of(wait, struct wait_page_queue, wait); - - if (!wake_page_match(wpq, key)) - return 0; - - req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; - list_del_init(&wait->entry); - io_req_task_queue(req); - return 1; -} - -/* - * This controls whether a given IO request should be armed for async page - * based retry. If we return false here, the request is handed to the async - * worker threads for retry. If we're doing buffered reads on a regular file, - * we prepare a private wait_page_queue entry and retry the operation. This - * will either succeed because the page is now uptodate and unlocked, or it - * will register a callback when the page is unlocked at IO completion. Through - * that callback, io_uring uses task_work to set up a retry of the operation. - * That retry will attempt the buffered read again. The retry will generally - * succeed, or in rare cases where it fails, we then fall back to using the - * async worker threads for a blocking retry.
- */ -static bool io_rw_should_retry(struct io_kiocb *req) -{ - struct io_async_rw *rw = req->async_data; - struct wait_page_queue *wait = &rw->wpq; - struct kiocb *kiocb = &req->rw.kiocb; - - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) - return false; - - /* Only for buffered IO */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) - return false; - - /* - * just use poll if we can, and don't attempt if the fs doesn't - * support callback based unlocks - */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) - return false; - - wait->wait.func = io_async_buf_func; - wait->wait.private = req; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_flags &= ~IOCB_NOWAIT; - kiocb->ki_waitq = wait; - return true; -} - -static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) -{ - if (likely(req->file->f_op->read_iter)) - return call_read_iter(req->file, &req->rw.kiocb, iter); - else if (req->file->f_op->read) - return loop_rw_iter(READ, req, iter); - else - return -EINVAL; -} - -static bool need_read_all(struct io_kiocb *req) -{ - return req->flags & REQ_F_ISREG || - S_ISBLK(file_inode(req->file)->i_mode); -} - -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) -{ - struct kiocb *kiocb = &req->rw.kiocb; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (unlikely(!file || !(file->f_mode & mode))) - return -EBADF; - - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, req->rw.flags); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or if IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->private = NULL; - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } - - return 0; -} - -static int io_read(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *rw; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - rw = req->async_data; - s = &rw->s; - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional.
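The save/restore dance io_read() relies on can be captured generically: snapshot the consumption state before a nonblocking attempt, and rewind on -EAGAIN so the async retry re-reads exactly what the caller asked for. A sketch with a stand-in cursor type (not the kernel's iov_iter):

	#include <errno.h>
	#include <stddef.h>

	struct cursor { size_t consumed; };	/* stand-in for an iov_iter */

	static long attempt_with_rewind(struct cursor *c,
					long (*attempt)(struct cursor *))
	{
		struct cursor snap = *c;	/* iov_iter_save_state() analogue */
		long ret = attempt(c);		/* may consume part of the cursor */

		if (ret == -EAGAIN)
			*c = snap;		/* iov_iter_restore() analogue */
		return ret;
	}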
- */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - - ret = io_iter_do_read(req, &s->iter); - - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) - return -EAGAIN; - /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (req->flags & REQ_F_NOWAIT) - goto done; - ret = 0; - } else if (ret == -EIOCBQUEUED) { - goto out_free; - } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { - /* read all, failed, already did sync or don't want to retry */ - goto done; - } - - /* - * Don't depend on the iter state matching what was consumed, or being - * untouched in case of error. Restore it and we'll advance it - * manually if we need to. - */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - if (ret2) - return ret2; - - iovec = NULL; - rw = req->async_data; - s = &rw->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ - - do { - /* - * We end up here because of a partial read, either from - * above or inside this loop. Advance the iter by the bytes - * that were consumed. - */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) - break; - rw->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); - - /* if we can retry, do so with the callbacks armed */ - if (!io_rw_should_retry(req)) { - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; - } - - /* - * Now retry read with the IOCB_WAITQ parts set in the iocb. If - * we get -EIOCBQUEUED, then we'll get a notification when the - * desired page gets unlocked. We can also get a partial read - * here, and if we do, then just retry at the new offset. - */ - ret = io_iter_do_read(req, &s->iter); - if (ret == -EIOCBQUEUED) - return 0; - /* we got some bytes, but not all. retry. 
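The retry loop above accumulates short reads into rw->bytes_done rather than failing them; the classic userspace equivalent of that accumulation:

	#include <unistd.h>

	static ssize_t read_full(int fd, char *buf, size_t len)
	{
		size_t done = 0;

		while (done < len) {
			ssize_t n = read(fd, buf + done, len - done);
			if (n <= 0)	/* error or EOF: report progress if any */
				return done ? (ssize_t)done : n;
			done += n;
		}
		return done;
	}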
*/ - kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); - } while (ret > 0); -done: - kiocb_done(req, ret, issue_flags); -out_free: - /* it's faster to check here than delegate to kfree */ - if (iovec) - kfree(iovec); - return 0; -} - -static int io_write(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *rw = req->async_data; - - s = &rw->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; - - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; - - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; - - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); - else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, &s->iter); - else - ret2 = -EINVAL; - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT.
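As just described, -EOPNOTSUPP from a nowait raw bdev write is treated as "retry without nowait". The control flow, reduced to a sketch around an assumed issue_write() backend (hypothetical, not a kernel or liburing API):

	#include <errno.h>

	extern long issue_write(int nowait);	/* assumed backend */

	static long write_with_fallback(void)
	{
		long ret = issue_write(1);	/* IOCB_NOWAIT first */

		if (ret == -EOPNOTSUPP)		/* raw bdev quirk */
			ret = -EAGAIN;
		if (ret == -EAGAIN)
			ret = issue_write(0);	/* blocking retry, like the io-wq punt */
		return ret;
	}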
- */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) - goto done; - if (!force_nonblock || ret2 != -EAGAIN) { - /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; -done: - kiocb_done(req, ret2, issue_flags); - } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - return ret ?: -EAGAIN; - } -out_free: - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; -} - -static int io_renameat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rename *ren = &req->rename; - const char __user *oldf, *newf; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ren->old_dfd = READ_ONCE(sqe->fd); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ren->new_dfd = READ_ONCE(sqe->len); - ren->flags = READ_ONCE(sqe->rename_flags); - - ren->oldpath = getname(oldf); - if (IS_ERR(ren->oldpath)) - return PTR_ERR(ren->oldpath); - - ren->newpath = getname(newf); - if (IS_ERR(ren->newpath)) { - putname(ren->oldpath); - return PTR_ERR(ren->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rename *ren = &req->rename; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, - ren->newpath, ren->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static inline void __io_xattr_finish(struct io_kiocb *req) -{ - struct io_xattr *ix = &req->xattr; - - if (ix->filename) - putname(ix->filename); - - kfree(ix->ctx.kname); - kvfree(ix->ctx.kvalue); -} - -static void io_xattr_finish(struct io_kiocb *req, int ret) -{ - req->flags &= ~REQ_F_NEED_CLEANUP; - - __io_xattr_finish(req); - io_req_complete(req, ret); -} - -static int __io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - ix->ctx.kvalue = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - if (ix->ctx.flags) - return -EINVAL; - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret == sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_fgetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_getxattr_prep(req, sqe); -} - -static int io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *path; - int ret; - - ret = __io_getxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, 
LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = &req->xattr; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), - req->file->f_path.dentry, - &ix->ctx); - - io_xattr_finish(req, ret); - return 0; -} - -static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = &req->xattr; - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_user_ns(path.mnt), - path.dentry, - &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return 0; -} - -static int __io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.kvalue = NULL; - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = setxattr_copy(name, &ix->ctx); - if (ret) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *path; - int ret; - - ret = __io_setxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fsetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_setxattr_prep(req, sqe); -} - -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - struct path *path) -{ - struct io_xattr *ix = &req->xattr; - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - -static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = __io_setxattr(req, issue_flags, &req->file->f_path); - io_xattr_finish(req, ret); - - return 0; -} - -static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = &req->xattr; - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return 0; -} - -static int io_unlinkat_prep(struct io_kiocb *req, - const 
struct io_uring_sqe *sqe) -{ - struct io_unlink *un = &req->unlink; - const char __user *fname; - - if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - un->dfd = READ_ONCE(sqe->fd); - - un->flags = READ_ONCE(sqe->unlink_flags); - if (un->flags & ~AT_REMOVEDIR) - return -EINVAL; - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - un->filename = getname(fname); - if (IS_ERR(un->filename)) - return PTR_ERR(un->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_unlink *un = &req->unlink; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (un->flags & AT_REMOVEDIR) - ret = do_rmdir(un->dfd, un->filename); - else - ret = do_unlinkat(un->dfd, un->filename); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static int io_mkdirat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_mkdir *mkd = &req->mkdir; - const char __user *fname; - - if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - mkd->dfd = READ_ONCE(sqe->fd); - mkd->mode = READ_ONCE(sqe->len); - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - mkd->filename = getname(fname); - if (IS_ERR(mkd->filename)) - return PTR_ERR(mkd->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_mkdir *mkd = &req->mkdir; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static int io_symlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_symlink *sl = &req->symlink; - const char __user *oldpath, *newpath; - - if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - sl->new_dfd = READ_ONCE(sqe->fd); - oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - - sl->oldpath = getname(oldpath); - if (IS_ERR(sl->oldpath)) - return PTR_ERR(sl->oldpath); - - sl->newpath = getname(newpath); - if (IS_ERR(sl->newpath)) { - putname(sl->oldpath); - return PTR_ERR(sl->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_symlink *sl = &req->symlink; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static int io_linkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_hardlink *lnk = &req->hardlink; - const char __user *oldf, *newf; - - if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - lnk->old_dfd = READ_ONCE(sqe->fd); - lnk->new_dfd = READ_ONCE(sqe->len); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - lnk->flags = READ_ONCE(sqe->hardlink_flags); - - lnk->oldpath = getname(oldf); - if (IS_ERR(lnk->oldpath)) - 
return PTR_ERR(lnk->oldpath); - - lnk->newpath = getname(newf); - if (IS_ERR(lnk->newpath)) { - putname(lnk->oldpath); - return PTR_ERR(lnk->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_hardlink *lnk = &req->hardlink; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, - lnk->newpath, lnk->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) -{ - req->uring_cmd.task_work_cb(&req->uring_cmd); -} - -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) -{ - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); - - req->uring_cmd.task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; - io_req_task_prio_work_add(req); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); - -/* - * Called by consumers of io_uring_cmd, if they originally returned - * -EIOCBQUEUED upon receiving the command. - */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) -{ - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); - - if (ret < 0) - req_set_fail(req); - if (req->ctx->flags & IORING_SETUP_CQE32) - __io_req_complete32(req, 0, ret, 0, res2, 0); - else - io_req_complete(req, ret); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_done); - -static int io_uring_cmd_prep_async(struct io_kiocb *req) -{ - size_t cmd_size; - - cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - - memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); - return 0; -} - -static int io_uring_cmd_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = &req->uring_cmd; - - if (sqe->rw_flags) - return -EINVAL; - ioucmd->cmd = sqe->cmd; - ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; -} - -static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_uring_cmd *ioucmd = &req->uring_cmd; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (!req->file->f_op->uring_cmd) - return -EOPNOTSUPP; - - if (ctx->flags & IORING_SETUP_SQE128) - issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) - issue_flags |= IO_URING_F_CQE32; - if (ctx->flags & IORING_SETUP_IOPOLL) - issue_flags |= IO_URING_F_IOPOLL; - - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - - ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) - io_uring_cmd_done(ioucmd, ret, 0); - return 0; -} - -static int __io_splice_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = &req->splice; - unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - - sp->len = READ_ONCE(sqe->len); - sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) - return -EINVAL; - sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); - return 0; -} - -static int io_tee_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) - return -EINVAL; - return __io_splice_prep(req, sqe); -} - -static int io_tee(struct 
io_kiocb *req, unsigned int issue_flags)
-{
-	struct io_splice *sp = &req->splice;
-	struct file *out = sp->file_out;
-	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
-	struct file *in;
-	long ret = 0;
-
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
-	if (sp->flags & SPLICE_F_FD_IN_FIXED)
-		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
-	else
-		in = io_file_get_normal(req, sp->splice_fd_in);
-	if (!in) {
-		ret = -EBADF;
-		goto done;
-	}
-
-	if (sp->len)
-		ret = do_tee(in, out, sp->len, flags);
-
-	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
-		io_put_file(in);
-done:
-	if (ret != sp->len)
-		req_set_fail(req);
-	__io_req_complete(req, 0, ret, 0);
-	return 0;
-}
-
-static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	struct io_splice *sp = &req->splice;
-
-	sp->off_in = READ_ONCE(sqe->splice_off_in);
-	sp->off_out = READ_ONCE(sqe->off);
-	return __io_splice_prep(req, sqe);
-}
-
-static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
-{
-	struct io_splice *sp = &req->splice;
-	struct file *out = sp->file_out;
-	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
-	loff_t *poff_in, *poff_out;
-	struct file *in;
-	long ret = 0;
-
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
-	if (sp->flags & SPLICE_F_FD_IN_FIXED)
-		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
-	else
-		in = io_file_get_normal(req, sp->splice_fd_in);
-	if (!in) {
-		ret = -EBADF;
-		goto done;
-	}
-
-	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
-	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
-
-	if (sp->len)
-		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
-
-	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
-		io_put_file(in);
-done:
-	if (ret != sp->len)
-		req_set_fail(req);
-	__io_req_complete(req, 0, ret, 0);
-	return 0;
-}
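Both do_tee() and do_splice() above only ever run from a blocking context; the early -EAGAIN simply punts the request to io-wq. From userspace the whole operation is still a single SQE. A sketch with liburing follows; the pipe setup, the byte count, and the "out.bin" path are assumptions for illustration. Offsets of -1 mean "use the current file position", mirroring the sp->off_in == -1 handling above.

/* Sketch: drive IORING_OP_SPLICE from userspace, moving bytes from a
 * pipe into a regular file.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int pipefd[2], fd;

	if (io_uring_queue_init(4, &ring, 0) || pipe(pipefd))
		return 1;
	fd = open("out.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return 1;
	write(pipefd[1], "hello", 5);	/* give the splice something to move */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_splice(sqe, pipefd[0], -1, fd, -1, 5, 0);
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("spliced %d bytes\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}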
-
-static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	/*
-	 * If the ring is setup with CQE32, relay back addr/addr2
-	 */
-	if (req->ctx->flags & IORING_SETUP_CQE32) {
-		req->nop.extra1 = READ_ONCE(sqe->addr);
-		req->nop.extra2 = READ_ONCE(sqe->addr2);
-	}
-
-	return 0;
-}
-
-/*
- * IORING_OP_NOP just posts a completion event, nothing else.
- */
-static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
-{
-	unsigned int cflags;
-	void __user *buf;
-
-	if (req->flags & REQ_F_BUFFER_SELECT) {
-		size_t len = 1;
-
-		buf = io_buffer_select(req, &len, issue_flags);
-		if (!buf)
-			return -ENOBUFS;
-	}
-
-	cflags = io_put_kbuf(req, issue_flags);
-	if (!(req->ctx->flags & IORING_SETUP_CQE32))
-		__io_req_complete(req, issue_flags, 0, cflags);
-	else
-		__io_req_complete32(req, issue_flags, 0, cflags,
-				    req->nop.extra1, req->nop.extra2);
-	return 0;
-}
-
-static int io_msg_ring_prep(struct io_kiocb *req,
-			    const struct io_uring_sqe *sqe)
-{
-	if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
-		     sqe->buf_index || sqe->personality))
-		return -EINVAL;
-
-	req->msg.user_data = READ_ONCE(sqe->off);
-	req->msg.len = READ_ONCE(sqe->len);
-	return 0;
-}
-
-static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
-{
-	struct io_ring_ctx *target_ctx;
-	struct io_msg *msg = &req->msg;
-	bool filled;
-	int ret;
-
-	ret = -EBADFD;
-	if (req->file->f_op != &io_uring_fops)
-		goto done;
-
-	ret = -EOVERFLOW;
-	target_ctx = req->file->private_data;
-
-	spin_lock(&target_ctx->completion_lock);
-	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
-	io_commit_cqring(target_ctx);
-	spin_unlock(&target_ctx->completion_lock);
-
-	if (filled) {
-		io_cqring_ev_posted(target_ctx);
-		ret = 0;
-	}
-
-done:
-	if (ret < 0)
-		req_set_fail(req);
-	__io_req_complete(req, issue_flags, ret, 0);
-	/* put file to avoid an attempt to IOPOLL the req */
-	io_put_file(req->file);
-	req->file = NULL;
-	return 0;
-}
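io_msg_ring() above is the kernel half of IORING_OP_MSG_RING: it stuffs a caller-chosen (user_data, res) pair into another ring's completion queue. A hedged sketch of the userspace half follows, assuming liburing 2.2 or newer for io_uring_prep_msg_ring(); the 0x1234 tag and the len of 16 are arbitrary example values.

/* Sketch: post a CQE from one ring into a second ring. The target ring
 * sees a completion it never submitted an SQE for, with cqe->res == 16
 * and cqe->user_data == 0x1234, matching msg->len / msg->user_data above.
 */
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring src, dst;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(4, &src, 0) || io_uring_queue_init(4, &dst, 0))
		return 1;

	sqe = io_uring_get_sqe(&src);
	io_uring_prep_msg_ring(sqe, dst.ring_fd, 16, 0x1234, 0);
	io_uring_submit(&src);

	if (io_uring_wait_cqe(&dst, &cqe) == 0) {
		printf("user_data=%llx res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		io_uring_cqe_seen(&dst, cqe);
	}
	io_uring_queue_exit(&src);
	io_uring_queue_exit(&dst);
	return 0;
}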
-
-static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
-		return -EINVAL;
-
-	req->sync.flags = READ_ONCE(sqe->fsync_flags);
-	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
-		return -EINVAL;
-
-	req->sync.off = READ_ONCE(sqe->off);
-	req->sync.len = READ_ONCE(sqe->len);
-	return 0;
-}
-
-static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
-{
-	loff_t end = req->sync.off + req->sync.len;
-	int ret;
-
-	/* fsync always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
-	ret = vfs_fsync_range(req->file, req->sync.off,
-			      end > 0 ? end : LLONG_MAX,
-			      req->sync.flags & IORING_FSYNC_DATASYNC);
-	io_req_complete(req, ret);
-	return 0;
-}
-
-static int io_fallocate_prep(struct io_kiocb *req,
-			     const struct io_uring_sqe *sqe)
-{
-	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
-		return -EINVAL;
-
-	req->sync.off = READ_ONCE(sqe->off);
-	req->sync.len = READ_ONCE(sqe->addr);
-	req->sync.mode = READ_ONCE(sqe->len);
-	return 0;
-}
-
-static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
-{
-	int ret;
-
-	/* fallocate always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
-			    req->sync.len);
-	if (ret >= 0)
-		fsnotify_modify(req->file);
-	io_req_complete(req, ret);
-	return 0;
-}
-
-static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	const char __user *fname;
-	int ret;
-
-	if (unlikely(sqe->buf_index))
-		return -EINVAL;
-	if (unlikely(req->flags & REQ_F_FIXED_FILE))
-		return -EBADF;
-
-	/* open.how should be already initialised */
-	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
-		req->open.how.flags |= O_LARGEFILE;
-
-	req->open.dfd = READ_ONCE(sqe->fd);
-	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	req->open.filename = getname(fname);
-	if (IS_ERR(req->open.filename)) {
-		ret = PTR_ERR(req->open.filename);
-		req->open.filename = NULL;
-		return ret;
-	}
-
-	req->open.file_slot = READ_ONCE(sqe->file_index);
-	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
-		return -EINVAL;
-
-	req->open.nofile = rlimit(RLIMIT_NOFILE);
-	req->flags |= REQ_F_NEED_CLEANUP;
-	return 0;
-}
-
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	u64 mode = READ_ONCE(sqe->len);
-	u64 flags = READ_ONCE(sqe->open_flags);
-
-	req->open.how = build_open_how(flags, mode);
-	return __io_openat_prep(req, sqe);
-}
-
-static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	struct open_how __user *how;
-	size_t len;
-	int ret;
-
-	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
-	len = READ_ONCE(sqe->len);
-	if (len < OPEN_HOW_SIZE_VER0)
-		return -EINVAL;
-
-	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
-				    len);
-	if (ret)
-		return ret;
-
-	return __io_openat_prep(req, sqe);
-}
-
-static int io_file_bitmap_get(struct io_ring_ctx *ctx)
-{
-	struct io_file_table *table = &ctx->file_table;
-	unsigned long nr = ctx->nr_user_files;
-	int ret;
-
-	do {
-		ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
-		if (ret != nr)
-			return ret;
-
-		if (!table->alloc_hint)
-			break;
-
-		nr = table->alloc_hint;
-		table->alloc_hint = 0;
-	} while (1);
-
-	return -ENFILE;
-}
-
-/*
- * Note when io_fixed_fd_install() returns error value, it will ensure
- * fput() is called correspondingly.
- */ -static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) -{ - bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - - if (alloc_slot) { - ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) - goto err; - file_slot = ret; - } else { - file_slot--; - } - - ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (!ret && alloc_slot) - ret = file_slot; -err: - io_ring_submit_unlock(ctx, issue_flags); - if (unlikely(ret < 0)) - fput(file); - return ret; -} - -static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) -{ - struct open_flags op; - struct file *file; - bool resolve_nonblock, nonblock_set; - bool fixed = !!req->open.file_slot; - int ret; - - ret = build_open_flags(&req->open.how, &op); - if (ret) - goto err; - nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; - op.lookup_flags |= LOOKUP_CACHED; - op.open_flag |= O_NONBLOCK; - } - - if (!fixed) { - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; - } - - file = do_filp_open(req->open.dfd, req->open.filename, &op); - if (IS_ERR(file)) { - /* - * We could hang on to this 'fd' on retrying, but seems like - * marginal gain for something that is now known to be a slower - * path. So just put it, and we'll get a new one when we retry. - */ - if (!fixed) - put_unused_fd(ret); - - ret = PTR_ERR(file); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if (ret == -EAGAIN && - (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) - return -EAGAIN; - goto err; - } - - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - - if (!fixed) - fd_install(ret, file); - else - ret = io_fixed_fd_install(req, issue_flags, file, - req->open.file_slot); -err: - putname(req->open.filename); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_openat(struct io_kiocb *req, unsigned int issue_flags) -{ - return io_openat2(req, issue_flags); -} - -static int io_remove_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_provide_buf *p = &req->pbuf; - u64 tmp; - - if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || - sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -EINVAL; - - memset(p, 0, sizeof(*p)); - p->nbufs = tmp; - p->bgid = READ_ONCE(sqe->buf_group); - return 0; -} - -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) -{ - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->buf_nr_pages) { - int j; - - i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - return i; - } - - /* the head kbuf is the list itself */ - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - - nxt = 
list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&nxt->list); - if (++i == nbufs) - return i; - cond_resched(); - } - i++; - - return i; -} - -static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = &req->pbuf; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - if (ret < 0) - req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, issue_flags); - return 0; -} - -static int io_provide_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - unsigned long size, tmp_check; - struct io_provide_buf *p = &req->pbuf; - u64 tmp; - - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -E2BIG; - p->nbufs = tmp; - p->addr = READ_ONCE(sqe->addr); - p->len = READ_ONCE(sqe->len); - - if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, - &size)) - return -EOVERFLOW; - if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) - return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) - return -EFAULT; - - p->bgid = READ_ONCE(sqe->buf_group); - tmp = READ_ONCE(sqe->off); - if (tmp > USHRT_MAX) - return -E2BIG; - p->bid = tmp; - return 0; -} - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - struct page *page; - int bufs_in_page; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. - */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; - } - - return 0; -} - -static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer_list *bl) -{ - struct io_buffer *buf; - u64 addr = pbuf->addr; - int i, bid = pbuf->bid; - - for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) - break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); - buf->addr = addr; - buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); - buf->bid = bid; - buf->bgid = pbuf->bgid; - addr += pbuf->len; - bid++; - cond_resched(); - } - - return i ? 
0 : -ENOMEM; -} - -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - int i; - - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; - } - - return 0; -} - -static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = &req->pbuf; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: - if (ret < 0) - req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, issue_flags); - return 0; -} - -static int io_epoll_ctl_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_EPOLL) - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - - req->epoll.epfd = READ_ONCE(sqe->fd); - req->epoll.op = READ_ONCE(sqe->len); - req->epoll.fd = READ_ONCE(sqe->off); - - if (ep_op_has_event(req->epoll.op)) { - struct epoll_event __user *ev; - - ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) - return -EFAULT; - } - - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_EPOLL) - struct io_epoll *ie = &req->epoll; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - - req->madvise.addr = READ_ONCE(sqe->addr); - req->madvise.len = READ_ONCE(sqe->len); - req->madvise.advice = READ_ONCE(sqe->fadvise_advice); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = &req->madvise; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_complete(req, ret); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - - req->fadvise.offset = READ_ONCE(sqe->off); - req->fadvise.len = READ_ONCE(sqe->len); - req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); - return 0; -} - -static int 
io_fadvise(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_fadvise *fa = &req->fadvise; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } - - ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - const char __user *path; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - req->statx.dfd = READ_ONCE(sqe->fd); - req->statx.mask = READ_ONCE(sqe->len); - path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->statx.flags = READ_ONCE(sqe->statx_flags); - - req->statx.filename = getname_flags(path, - getname_statx_lookup_flags(req->statx.flags), - NULL); - - if (IS_ERR(req->statx.filename)) { - int ret = PTR_ERR(req->statx.filename); - - req->statx.filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_statx(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_statx *ctx = &req->statx; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, - ctx->buffer); - io_req_complete(req, ret); - return 0; -} - -static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (sqe->off || sqe->addr || sqe->len || sqe->buf_index) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - req->close.fd = READ_ONCE(sqe->fd); - req->close.file_slot = READ_ONCE(sqe->file_index); - req->close.flags = READ_ONCE(sqe->close_flags); - if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT) - return -EINVAL; - if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) && - req->close.file_slot && req->close.fd) - return -EINVAL; - - return 0; -} - -static int io_close(struct io_kiocb *req, unsigned int issue_flags) -{ - struct files_struct *files = current->files; - struct io_close *close = &req->close; - struct fdtable *fdt; - struct file *file; - int ret = -EBADF; - - if (req->close.file_slot) { - ret = io_close_fixed(req, issue_flags); - if (ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT)) - goto err; - } - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (close->fd >= fdt->max_fds) { - spin_unlock(&files->file_lock); - goto err; - } - file = rcu_dereference_protected(fdt->fd[close->fd], - lockdep_is_held(&files->file_lock)); - if (!file || file->f_op == &io_uring_fops) { - spin_unlock(&files->file_lock); - goto err; - } - - /* if the file has a flush method, be safe and punt to async */ - if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { - spin_unlock(&files->file_lock); - return -EAGAIN; - } - - file = __close_fd_get_file(close->fd); - spin_unlock(&files->file_lock); - if (!file) - goto err; - - /* No ->flush() or already async, safely close from here */ - ret = filp_close(file, current->files); -err: - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - req->sync.off = READ_ONCE(sqe->off); - 
req->sync.len = READ_ONCE(sqe->len);
-	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
-	return 0;
-}
-
-static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
-{
-	int ret;
-
-	/* sync_file_range always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
-	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
-				req->sync.flags);
-	io_req_complete(req, ret);
-	return 0;
-}
-
-#if defined(CONFIG_NET)
-static int io_shutdown_prep(struct io_kiocb *req,
-			    const struct io_uring_sqe *sqe)
-{
-	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
-		     sqe->buf_index || sqe->splice_fd_in))
-		return -EINVAL;
-
-	req->shutdown.how = READ_ONCE(sqe->len);
-	return 0;
-}
-
-static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
-{
-	struct socket *sock;
-	int ret;
-
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
-	sock = sock_from_file(req->file);
-	if (unlikely(!sock))
-		return -ENOTSOCK;
-
-	ret = __sys_shutdown_sock(sock, req->shutdown.how);
-	io_req_complete(req, ret);
-	return 0;
-}
-
-static bool io_net_retry(struct socket *sock, int flags)
-{
-	if (!(flags & MSG_WAITALL))
-		return false;
-	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
-}
-
-static int io_setup_async_msg(struct io_kiocb *req,
-			      struct io_async_msghdr *kmsg)
-{
-	struct io_async_msghdr *async_msg = req->async_data;
-
-	if (async_msg)
-		return -EAGAIN;
-	if (io_alloc_async_data(req)) {
-		kfree(kmsg->free_iov);
-		return -ENOMEM;
-	}
-	async_msg = req->async_data;
-	req->flags |= REQ_F_NEED_CLEANUP;
-	memcpy(async_msg, kmsg, sizeof(*kmsg));
-	async_msg->msg.msg_name = &async_msg->addr;
-	/* if we're using fast_iov, set it to the new one */
-	if (!async_msg->free_iov)
-		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
-
-	return -EAGAIN;
-}
-
-static int io_sendmsg_copy_hdr(struct io_kiocb *req,
-			       struct io_async_msghdr *iomsg)
-{
-	iomsg->msg.msg_name = &iomsg->addr;
-	iomsg->free_iov = iomsg->fast_iov;
-	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
-				   req->sr_msg.msg_flags, &iomsg->free_iov);
-}
-
-static int io_sendmsg_prep_async(struct io_kiocb *req)
-{
-	int ret;
-
-	ret = io_sendmsg_copy_hdr(req, req->async_data);
-	if (!ret)
-		req->flags |= REQ_F_NEED_CLEANUP;
-	return ret;
-}
-
-static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-	struct io_sr_msg *sr = &req->sr_msg;
-
-	if (unlikely(sqe->file_index))
-		return -EINVAL;
-	if (unlikely(sqe->addr2 || sqe->file_index))
-		return -EINVAL;
-
-	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
-	sr->len = READ_ONCE(sqe->len);
-	sr->flags = READ_ONCE(sqe->addr2);
-	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
-		return -EINVAL;
-	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
-	if (sr->msg_flags & MSG_DONTWAIT)
-		req->flags |= REQ_F_NOWAIT;
-
-#ifdef CONFIG_COMPAT
-	if (req->ctx->compat)
-		sr->msg_flags |= MSG_CMSG_COMPAT;
-#endif
-	sr->done_io = 0;
-	return 0;
-}
-
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
-	struct io_async_msghdr iomsg, *kmsg;
-	struct io_sr_msg *sr = &req->sr_msg;
-	struct socket *sock;
-	unsigned flags;
-	int min_ret = 0;
-	int ret;
-
-	sock = sock_from_file(req->file);
-	if (unlikely(!sock))
-		return -ENOTSOCK;
-
-	if (req_has_async_data(req)) {
-		kmsg = req->async_data;
-	} else {
-		ret = io_sendmsg_copy_hdr(req, &iomsg);
-		if (ret)
-			return ret;
-		kmsg = &iomsg;
-	}
-
-	if (!(req->flags & REQ_F_POLLED) &&
-	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
-
return io_setup_async_msg(req, kmsg); - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct iovec __user *uiov; - size_t iov_len; - int ret; - - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); - if (ret) - return ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) - return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; - } - - return ret; -} - -#ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; - int ret; - - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); - if (ret) - return ret; - - uiov = compat_ptr(ptr); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return 
-EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) - return ret; - } - - return 0; -} -#endif - -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif - - return __io_recvmsg_copy_hdr(req, iomsg); -} - -static int io_recvmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_recvmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = &req->sr_msg; - - if (unlikely(sqe->file_index)) - return -EINVAL; - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; - struct socket *sock; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); - } - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { - req_set_fail(req); - } - - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; -} - -static int 
io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct socket *sock; - struct iovec iov; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - } - - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - ret = sock_recvmsg(sock, &msg, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { -out_free: - req_set_fail(req); - } - - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_accept *accept = &req->accept; - unsigned flags; - - if (sqe->len || sqe->buf_index) - return -EINVAL; - - accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - accept->flags = READ_ONCE(sqe->accept_flags); - accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) - return -EINVAL; - - accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot) { - if (accept->flags & SOCK_CLOEXEC) - return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && - accept->file_slot != IORING_FILE_INDEX_ALLOC) - return -EINVAL; - } - if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) - accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) - req->flags |= REQ_F_APOLL_MULTISHOT; - return 0; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = &req->accept; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; - bool fixed = !!accept->file_slot; - struct file *file; - int ret, fd; - -retry: - if (!fixed) { - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, - accept->flags); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if ((req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) - ret = 0; - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - accept->file_slot); - } - - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __io_req_complete(req, issue_flags, ret, 0); - return 0; - } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - - return ret; -} - -static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_socket *sock = &req->sock; - - if (sqe->addr || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - - sock->domain = READ_ONCE(sqe->fd); - sock->type = READ_ONCE(sqe->off); - sock->protocol = READ_ONCE(sqe->len); - sock->file_slot = READ_ONCE(sqe->file_index); - sock->nofile = rlimit(RLIMIT_NOFILE); - - sock->flags = sock->type & ~SOCK_TYPE_MASK; - if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) - return -EINVAL; - if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - return 0; -} - -static int io_socket(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_socket *sock = &req->sock; - bool fixed = !!sock->file_slot; - struct file *file; - int ret, fd; - - if (!fixed) { - fd = __get_unused_fd_flags(sock->flags, sock->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = __sys_socket_file(sock->domain, sock->type, sock->protocol); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - sock->file_slot); - } - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = &req->connect; - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_connect *conn = &req->connect; - - if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - conn->addr_len = READ_ONCE(sqe->addr2); - return 0; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_async_connect __io, *io; - unsigned file_flags; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (req_has_async_data(req)) { - io = req->async_data; - } else 
{
-		ret = move_addr_to_kernel(req->connect.addr,
-					  req->connect.addr_len,
-					  &__io.address);
-		if (ret)
-			goto out;
-		io = &__io;
-	}
-
-	file_flags = force_nonblock ? O_NONBLOCK : 0;
-
-	ret = __sys_connect_file(req->file, &io->address,
-				 req->connect.addr_len, file_flags);
-	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
-		if (req_has_async_data(req))
-			return -EAGAIN;
-		if (io_alloc_async_data(req)) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		memcpy(req->async_data, &__io, sizeof(__io));
-		return -EAGAIN;
-	}
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
-out:
-	if (ret < 0)
-		req_set_fail(req);
-	__io_req_complete(req, issue_flags, ret, 0);
-	return 0;
-}
-#else /* !CONFIG_NET */
-#define IO_NETOP_FN(op)						\
-static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
-{									\
-	return -EOPNOTSUPP;						\
-}
-
-#define IO_NETOP_PREP(op)						\
-IO_NETOP_FN(op)							\
-static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
-{									\
-	return -EOPNOTSUPP;						\
-}									\
-
-#define IO_NETOP_PREP_ASYNC(op)					\
-IO_NETOP_PREP(op)							\
-static int io_##op##_prep_async(struct io_kiocb *req)			\
-{									\
-	return -EOPNOTSUPP;						\
-}
-
-IO_NETOP_PREP_ASYNC(sendmsg);
-IO_NETOP_PREP_ASYNC(recvmsg);
-IO_NETOP_PREP_ASYNC(connect);
-IO_NETOP_PREP(accept);
-IO_NETOP_PREP(socket);
-IO_NETOP_PREP(shutdown);
-IO_NETOP_FN(send);
-IO_NETOP_FN(recv);
-#endif /* CONFIG_NET */
-
-struct io_poll_table {
-	struct poll_table_struct pt;
-	struct io_kiocb *req;
-	int nr_entries;
-	int error;
-};
-
-#define IO_POLL_CANCEL_FLAG	BIT(31)
-#define IO_POLL_REF_MASK	GENMASK(30, 0)
-
-/*
- * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
- * bump it and acquire ownership. It's disallowed to modify requests while not
- * owning them; this prevents races when enqueueing task_work and between
- * arming poll and wakeups.
- */
-static inline bool io_poll_get_ownership(struct io_kiocb *req)
-{
-	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
-}
-
-static void io_poll_mark_cancelled(struct io_kiocb *req)
-{
-	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
-}
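The poll_refs scheme above is a plain fetch-and-increment handoff: whoever moves the refcount off zero owns the request, and everyone else only leaves a mark for the owner to notice. Below is a toy userspace model in C11 atomics; it is purely illustrative, not kernel code, and the names and 31-bit mask merely mirror the defines above.

/* Toy model of the ->poll_refs ownership protocol. The first caller to
 * bump the count from zero becomes the owner; the owner later drops the
 * refs it observed and must retry if new ones arrived in the meantime,
 * mirroring the atomic_sub_return() loop in io_poll_check_events().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CANCEL_FLAG	(1u << 31)		/* like IO_POLL_CANCEL_FLAG */
#define REF_MASK	((1u << 31) - 1)	/* like IO_POLL_REF_MASK */

static atomic_uint poll_refs;

static bool get_ownership(void)
{
	/* owner iff the refs part was zero before our increment */
	return !(atomic_fetch_add(&poll_refs, 1) & REF_MASK);
}

static bool drop_ownership(unsigned int nr_refs)
{
	/* true if someone re-took a ref while we were working */
	return atomic_fetch_sub(&poll_refs, nr_refs) != nr_refs;
}

int main(void)
{
	printf("first taker owns: %d\n", get_ownership());	/* 1 */
	printf("second taker owns: %d\n", get_ownership());	/* 0 */
	printf("must retry: %d\n", drop_ownership(2));		/* 0 */
	return 0;
}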
-
-static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
-{
-	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
-	if (req->opcode == IORING_OP_POLL_ADD)
-		return req->async_data;
-	return req->apoll->double_poll;
-}
-
-static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
-{
-	if (req->opcode == IORING_OP_POLL_ADD)
-		return &req->poll;
-	return &req->apoll->poll;
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-	struct hlist_head *list;
-
-	list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
-	hlist_add_head(&req->hash_node, list);
-}
-
-static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
-			      wait_queue_func_t wake_func)
-{
-	poll->head = NULL;
-#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
-	/* mask in events that we always want/need */
-	poll->events = events | IO_POLL_UNMASK;
-	INIT_LIST_HEAD(&poll->wait.entry);
-	init_waitqueue_func_entry(&poll->wait, wake_func);
-}
-
-static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
-{
-	struct wait_queue_head *head = smp_load_acquire(&poll->head);
-
-	if (head) {
-		spin_lock_irq(&head->lock);
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
-		spin_unlock_irq(&head->lock);
-	}
-}
-
-static void io_poll_remove_entries(struct io_kiocb *req)
-{
-	/*
-	 * Nothing to do if neither of those flags is set. Avoid dipping
-	 * into the poll/apoll/double cachelines if we can.
-	 */
-	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
-		return;
-
-	/*
-	 * While we hold the waitqueue lock and the waitqueue is nonempty,
-	 * wake_up_pollfree() will wait for us. However, taking the waitqueue
-	 * lock in the first place can race with the waitqueue being freed.
-	 *
-	 * We solve this as eventpoll does: by taking advantage of the fact that
-	 * all users of wake_up_pollfree() will RCU-delay the actual free. If
-	 * we enter rcu_read_lock() and see that the pointer to the queue is
-	 * non-NULL, we can then lock it without the memory being freed out from
-	 * under us.
-	 *
-	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
-	 * case the caller deletes the entry from the queue, leaving it empty.
-	 * In that case, only RCU prevents the queue memory from being freed.
-	 */
-	rcu_read_lock();
-	if (req->flags & REQ_F_SINGLE_POLL)
-		io_poll_remove_entry(io_poll_get_single(req));
-	if (req->flags & REQ_F_DOUBLE_POLL)
-		io_poll_remove_entry(io_poll_get_double(req));
-	rcu_read_unlock();
-}
-
-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);
-/*
- * All poll tw should go through this. Checks for poll events, manages
- * references, does rewait, etc.
- *
- * Returns a negative error on failure. >0 when no action is required, which
- * means either a spurious wakeup or a multishot CQE was served. 0 when it's
- * done with the request, in which case the mask is stored in req->cqe.res.
- */
-static int io_poll_check_events(struct io_kiocb *req, bool *locked)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-	int v, ret;
-
-	/* req->task == current here, checking PF_EXITING is safe */
-	if (unlikely(req->task->flags & PF_EXITING))
-		return -ECANCELED;
-
-	do {
-		v = atomic_read(&req->poll_refs);
-
-		/* tw handler should be the owner, and so have some references */
-		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
-			return 0;
-		if (v & IO_POLL_CANCEL_FLAG)
-			return -ECANCELED;
-
-		if (!req->cqe.res) {
-			struct poll_table_struct pt = { ._key = req->apoll_events };
-			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
-		}
-
-		if ((unlikely(!req->cqe.res)))
-			continue;
-		if (req->apoll_events & EPOLLONESHOT)
-			return 0;
-
-		/* multishot, just fill a CQE and proceed */
-		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
-			__poll_t mask = mangle_poll(req->cqe.res &
-						    req->apoll_events);
-			bool filled;
-
-			spin_lock(&ctx->completion_lock);
-			filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
-						 mask, IORING_CQE_F_MORE);
-			io_commit_cqring(ctx);
-			spin_unlock(&ctx->completion_lock);
-			if (filled) {
-				io_cqring_ev_posted(ctx);
-				continue;
-			}
-			return -ECANCELED;
-		}
-
-		io_tw_lock(req->ctx, locked);
-		if (unlikely(req->task->flags & PF_EXITING))
-			return -EFAULT;
-		ret = io_issue_sqe(req,
-				   IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
-		if (ret)
-			return ret;
-
-		/*
-		 * Release all references, retry if someone tried to restart
-		 * task_work while we were executing it.
-		 */
-	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
-
-	return 1;
-}
- */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - - return 1; -} - -static void io_poll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - if (!ret) { - req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); - } else { - req->cqe.res = ret; - req_set_fail(req); - } - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - __io_req_complete_post(req, req->cqe.res, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void io_apoll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); - - if (!ret) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, ret); -} - -static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) -{ - req->cqe.res = mask; - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. - */ - req->apoll_events = events; - if (req->opcode == IORING_OP_POLL_ADD) - req->io_task_work.func = io_poll_task_func; - else - req->io_task_work.func = io_apoll_task_func; - - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); - io_req_task_work_add(req); -} - -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) -{ - if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); -} - -static void io_poll_cancel_req(struct io_kiocb *req) -{ - io_poll_mark_cancelled(req); - /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); -} - -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) -#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) - -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wqe_to_req(wait); - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); - __poll_t mask = key_to_poll(key); - - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. 
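
The POLLFREE case above is triggered by the few drivers that free a still-watched waitqueue via wake_up_pollfree(). A sketch of that driver-side contract (demo_dev is made up; the RCU-delayed free is exactly what the comments above rely on):

	struct demo_dev {
		struct wait_queue_head wq;
		struct rcu_head rcu;
	};

	static void demo_destroy(struct demo_dev *dev)
	{
		/* wakes all waiters with EPOLLHUP | POLLFREE, so
		 * io_poll_wake() above unhooks itself immediately */
		wake_up_pollfree(&dev->wq);
		/* required: the waitqueue memory must survive a full
		 * RCU grace period after the wakeup */
		kfree_rcu(dev, rcu);
	}
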
- */ - smp_store_release(&poll->head, NULL); - return 1; - } - - /* for instances that support it check for an event match first */ - if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) - return 0; - - if (io_poll_get_ownership(req)) { - /* optional, saves extra locking for removal in tw handler */ - if (mask && poll->events & EPOLLONESHOT) { - list_del_init(&poll->wait.entry); - poll->head = NULL; - if (wqe_is_double(wait)) - req->flags &= ~REQ_F_DOUBLE_POLL; - else - req->flags &= ~REQ_F_SINGLE_POLL; - } - __io_poll_execute(req, mask, poll->events); - } - return 1; -} - -static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, - struct wait_queue_head *head, - struct io_poll_iocb **poll_ptr) -{ - struct io_kiocb *req = pt->req; - unsigned long wqe_private = (unsigned long) req; - - /* - * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll_iocb - * if this happens. - */ - if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *first = poll; - - /* double add on the same waitqueue head, ignore */ - if (first->head == head) - return; - /* already have a 2nd entry, fail a third attempt */ - if (*poll_ptr) { - if ((*poll_ptr)->head == head) - return; - pt->error = -EINVAL; - return; - } - - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); - if (!poll) { - pt->error = -ENOMEM; - return; - } - /* mark as double wq entry */ - wqe_private |= 1; - req->flags |= REQ_F_DOUBLE_POLL; - io_init_poll_iocb(poll, first->events, first->wait.func); - *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; - } - - req->flags |= REQ_F_SINGLE_POLL; - pt->nr_entries++; - poll->head = head; - poll->wait.private = (void *) wqe_private; - - if (poll->events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(head, &poll->wait); - else - add_wait_queue(head, &poll->wait); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - - __io_queue_proc(&pt->req->poll, pt, head, - (struct io_poll_iocb **) &pt->req->async_data); -} - -static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, - struct io_poll_table *ipt, __poll_t mask) -{ - struct io_ring_ctx *ctx = req->ctx; - int v; - - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - io_init_poll_iocb(poll, mask, io_poll_wake); - poll->file = req->file; - - ipt->pt._key = mask; - ipt->req = req; - ipt->error = 0; - ipt->nr_entries = 0; - - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). - */ - atomic_set(&req->poll_refs, 1); - mask = vfs_poll(req->file, &ipt->pt) & poll->events; - - if (mask && (poll->events & EPOLLONESHOT)) { - io_poll_remove_entries(req); - /* no one else has access to the req, forget about the ref */ - return mask; - } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - - spin_lock(&ctx->completion_lock); - io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); - - if (mask) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) - poll->events |= EPOLLONESHOT; - __io_poll_execute(req, mask, poll->events); - return 0; - } - - /* - * Release ownership. 
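
__io_queue_proc() above needs the second entry because a driver's ->poll() may call poll_wait() more than once. A sketch of such a method (demo_pipe and its two helpers are hypothetical):

	static __poll_t demo_poll(struct file *file, poll_table *pt)
	{
		struct demo_pipe *p = file->private_data;
		__poll_t mask = 0;

		poll_wait(file, &p->read_wq, pt);	/* 1st entry: REQ_F_SINGLE_POLL */
		poll_wait(file, &p->write_wq, pt);	/* 2nd entry: REQ_F_DOUBLE_POLL */

		if (demo_data_ready(p))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (demo_space_free(p))
			mask |= EPOLLOUT | EPOLLWRNORM;
		return mask;
	}
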
If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); - return 0; -} - -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -enum { - IO_APOLL_OK, - IO_APOLL_ABORTED, - IO_APOLL_READY -}; - -static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; - struct async_poll *apoll; - struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; - int ret; - - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) - return IO_APOLL_ABORTED; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) - mask |= EPOLLONESHOT; - - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - if (req->flags & REQ_F_POLLED) { - apoll = req->apoll; - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); - } else { - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; - } - apoll->double_poll = NULL; - req->apoll = apoll; - req->flags |= REQ_F_POLLED; - ipt.pt._qproc = io_async_queue_proc; - - io_kbuf_recycle(req, issue_flags); - - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; - - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); - return IO_APOLL_OK; -} - -/* - * Returns true if we found and killed one or more poll requests - */ -static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - int i; - - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { - hlist_del_init(&req->hash_node); - io_poll_cancel_req(req); - found = true; - } - } - } - spin_unlock(&ctx->completion_lock); - return found; -} - -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct hlist_head *list; - struct io_kiocb *req; - - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { - if (cd->data != req->cqe.user_data) - continue; - if (poll_only && req->opcode != IORING_OP_POLL_ADD) - continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - return req; - } - return NULL; -} - -static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - int i; - - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - return req; - } - } - return NULL; -} - -static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) -{ - if (!io_poll_get_ownership(req)) - return false; - io_poll_remove_entries(req); - hash_del(&req->hash_node); - return true; -} - -static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); - else - req = io_poll_find(ctx, false, cd); - if (!req) - return -ENOENT; - io_poll_cancel_req(req); - return 0; -} - -static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, - unsigned int flags) -{ - u32 events; - - events = READ_ONCE(sqe->poll32_events); -#ifdef __BIG_ENDIAN - events = swahw32(events); -#endif - if (!(flags & IORING_POLL_ADD_MULTI)) - events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); -} - -static int io_poll_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_poll_update *upd = &req->poll_update; - u32 flags; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | - IORING_POLL_ADD_MULTI)) - return -EINVAL; - /* meaningless without update */ - if (flags == IORING_POLL_ADD_MULTI) - return -EINVAL; - - upd->old_user_data = READ_ONCE(sqe->addr); - upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; - upd->update_user_data = flags & 
IORING_POLL_UPDATE_USER_DATA; - - upd->new_user_data = READ_ONCE(sqe->off); - if (!upd->update_user_data && upd->new_user_data) - return -EINVAL; - if (upd->update_events) - upd->events = io_poll_parse_events(sqe, flags); - else if (sqe->poll32_events) - return -EINVAL; - - return 0; -} - -static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_poll_iocb *poll = &req->poll; - u32 flags; - - if (sqe->buf_index || sqe->off || sqe->addr) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) - return -EINVAL; - if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) - return -EINVAL; - - io_req_set_refcount(req); - req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); - return 0; -} - -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll_iocb *poll = &req->poll; - struct io_poll_table ipt; - int ret; - - ipt.pt._qproc = io_poll_queue_proc; - - ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); - ret = ret ?: ipt.error; - if (ret) - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *preq; - int ret2, ret = 0; - bool locked; - - spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; - goto out; - } - spin_unlock(&ctx->completion_lock); - - if (req->poll_update.update_events || req->poll_update.update_user_data) { - /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; - } - if (req->poll_update.update_user_data) - preq->cqe.user_data = req->poll_update.new_user_data; - - ret2 = io_poll_add(preq, issue_flags); - /* successfully updated, don't complete poll request */ - if (!ret2) - goto out; - } - - req_set_fail(preq); - preq->cqe.res = -ECANCELED; - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); -out: - if (ret < 0) - req_set_fail(req); - /* complete update request, we're done with it */ - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *req = data->req; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&req->timeout.list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) - req_set_fail(req); - - req->cqe.res = -ETIME; - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; - - list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != req->cqe.user_data) - continue; - if 
(cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - found = true; - break; - } - if (!found) - return ERR_PTR(-ENOENT); - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return ERR_PTR(-EALREADY); - list_del_init(&req->timeout.list); - return req; -} - -static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - spin_lock_irq(&ctx->timeout_lock); - req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); - - if (IS_ERR(req)) - return PTR_ERR(req); - io_req_task_queue_fail(req, -ECANCELED); - return 0; -} - -static clockid_t io_timeout_get_clock(struct io_timeout_data *data) -{ - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } -} - -static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; - - list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->cqe.user_data; - if (found) - break; - } - if (!found) - return -ENOENT; - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_cancel_data cd = { .data = user_data, }; - struct io_kiocb *req = io_timeout_extract(ctx, &cd); - struct io_timeout_data *data; - - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout.off = 0; /* noseq */ - data = req->async_data; - list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_timeout_rem *tr = &req->timeout_rem; - - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->buf_index || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - tr->ltimeout = false; - tr->addr = READ_ONCE(sqe->addr); - tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { - if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) - tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) - return -EINVAL; - } else if (tr->flags) { - /* timeout removal doesn't support flags */ - return -EINVAL; - } - - return 0; -} - -static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) -{ - return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS - : HRTIMER_MODE_REL; -} - -/* - * Remove or update an existing timeout command - */ -static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout_rem *tr = &req->timeout_rem; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; - - spin_lock(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, &cd); - spin_unlock(&ctx->completion_lock); - } else { - enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - - spin_lock_irq(&ctx->timeout_lock); - if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); - } - - if (ret < 0) - req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; -} - -static int __io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - bool is_timeout_link) -{ - struct io_timeout_data *data; - unsigned flags; - u32 off = READ_ONCE(sqe->off); - - if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) - return -EINVAL; - if (off && is_timeout_link) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) - return -EINVAL; - /* more than one clock specified is invalid, obviously */ - if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - - INIT_LIST_HEAD(&req->timeout.list); - req->timeout.off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; - - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -ENOMEM; - - data = req->async_data; - data->req = req; - data->flags = flags; - - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; - - INIT_LIST_HEAD(&req->timeout.list); - data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); - - if (is_timeout_link) { - struct io_submit_link *link = &req->ctx->submit_state.link; - - if (!link->head) - return -EINVAL; - if (link->last->opcode == IORING_OP_LINK_TIMEOUT) - return -EINVAL; - req->timeout.head = link->last; - link->last->flags |= REQ_F_ARM_LTIMEOUT; - } - return 0; -} - -static int io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, false); -} - -static int io_link_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, true); -} - -static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = req->async_data; - struct list_head *entry; - u32 tail, off = req->timeout.off; - - spin_lock_irq(&ctx->timeout_lock); - - /* - * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. If it isn't set, then this is - * a pure timeout request, sequence isn't used. - */ - if (io_is_timeout_noseq(req)) { - entry = ctx->timeout_list.prev; - goto add; - } - - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - req->timeout.target_seq = tail + off; - - /* Update the last seq here in case io_flush_timeouts() hasn't. 
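
As the comment above notes, sqe->off turns a plain relative timeout into a completion-counting one. A liburing sketch (initialized ring assumed; the user_data value is arbitrary):

	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* CQE res is 0 once 8 other completions have been posted, or
	 * -ETIME if the full second elapses first (io_timeout_fn above) */
	io_uring_prep_timeout(sqe, &ts, 8, 0);
	io_uring_sqe_set_data64(sqe, 0x77);
	io_uring_submit(&ring);

Passing IORING_TIMEOUT_ABS in the flags argument selects HRTIMER_MODE_ABS via io_translate_timeout_mode() above.
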
- * This is safe because ->completion_lock is held, and submissions - * and completions are never mixed in the same ->completion_lock section. - */ - ctx->cq_last_tm_flush = tail; - - /* - * Insertion sort, ensuring the first entry in the list is always - * the one we need first. - */ - list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, - timeout.list); - - if (io_is_timeout_noseq(nxt)) - continue; - /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nxt->timeout.target_seq - tail) - break; - } -add: - list_add(&req->timeout.list, entry); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); - return 0; -} - -static bool io_cancel_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; - - if (req->ctx != cd->ctx) - return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { - if (req->file != cd->file) - return false; - } else { - if (req->cqe.user_data != cd->data) - return false; - } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - return false; - req->work.cancel_seq = cd->seq; - } - return true; -} - -static int io_async_cancel_one(struct io_uring_task *tctx, - struct io_cancel_data *cd) -{ - enum io_wq_cancel cancel_ret; - int ret = 0; - bool all; - - if (!tctx || !tctx->io_wq) - return -ENOENT; - - all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); - switch (cancel_ret) { - case IO_WQ_CANCEL_OK: - ret = 0; - break; - case IO_WQ_CANCEL_RUNNING: - ret = -EALREADY; - break; - case IO_WQ_CANCEL_NOTFOUND: - ret = -ENOENT; - break; - } - - return ret; -} - -static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - - ret = io_async_cancel_one(req->task->io_uring, cd); - /* - * Fall-through even for -EALREADY, as we may have poll armed - * that need unarming. 
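
The matching rules in io_cancel_cb() above surface to userspace as the IORING_ASYNC_CANCEL_* flags. A sketch (io_uring_prep_cancel64() and io_uring_prep_cancel_fd() need liburing 2.2+; error handling elided):

	struct io_uring_sqe *sqe;

	/* cancel one request, matched by user_data */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_cancel64(sqe, 0x1234, 0);

	/* or cancel every request targeting an fd; without _ALL only
	 * the first match is canceled */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_cancel_fd(sqe, sockfd, IORING_ASYNC_CANCEL_ALL);
	io_uring_submit(&ring);

With _ALL or _ANY the cancel CQE carries the number of requests canceled; otherwise it is 0, -ENOENT, or -EALREADY, matching io_async_cancel_one() above.
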
- */ - if (!ret) - return 0; - - spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, cd); - if (ret != -ENOENT) - goto out; - if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) - ret = io_timeout_cancel(ctx, cd); -out: - spin_unlock(&ctx->completion_lock); - return ret; -} - -#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) - -static int io_async_cancel_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - req->cancel.addr = READ_ONCE(sqe->addr); - req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~CANCEL_FLAGS) - return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { - if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) - return -EINVAL; - req->cancel.fd = READ_ONCE(sqe->fd); - } - - return 0; -} - -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, - unsigned int issue_flags) -{ - bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - struct io_ring_ctx *ctx = cd->ctx; - struct io_tctx_node *node; - int ret, nr = 0; - - do { - ret = io_try_cancel(req, cd); - if (ret == -ENOENT) - break; - if (!all) - return ret; - nr++; - } while (1); - - /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, issue_flags); - ret = -ENOENT; - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); - if (ret != -ENOENT) { - if (!all) - break; - nr++; - } - } - io_ring_submit_unlock(ctx, issue_flags); - return all ? nr : ret; -} - -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = req->cancel.addr, - .flags = req->cancel.flags, - .seq = atomic_inc_return(&req->ctx->cancel_seq), - }; - int ret; - - if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cancel.fd, - issue_flags); - else - req->file = io_file_get_normal(req, req->cancel.fd); - if (!req->file) { - ret = -EBADF; - goto done; - } - cd.file = req->file; - } - - ret = __io_async_cancel(&cd, req, issue_flags); -done: - if (ret < 0) - req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; -} - -static int io_files_update_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - req->rsrc_update.offset = READ_ONCE(sqe->off); - req->rsrc_update.nr_args = READ_ONCE(sqe->len); - if (!req->rsrc_update.nr_args) - return -EINVAL; - req->rsrc_update.arg = READ_ONCE(sqe->addr); - return 0; -} - -static int io_files_update_with_index_alloc(struct io_kiocb *req, - unsigned int issue_flags) -{ - __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); - unsigned int done; - struct file *file; - int ret, fd; - - for (done = 0; done < req->rsrc_update.nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { - ret = -EFAULT; - break; - } - - file = fget(fd); - if (!file) { - ret = -EBADF; - break; - } - ret = io_fixed_fd_install(req, issue_flags, file, - IORING_FILE_INDEX_ALLOC); - if (ret < 0) - break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - ret = -EFAULT; - __io_close_fixed(req, issue_flags, ret); - break; - } - } - - if (done) - 
return done; - return ret; -} - -static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up; - int ret; - - up.offset = req->rsrc_update.offset; - up.data = req->rsrc_update.arg; - up.nr = 0; - up.tags = 0; - up.resv = 0; - up.resv2 = 0; - - if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { - ret = io_files_update_with_index_alloc(req, issue_flags); - } else { - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, issue_flags); - } - - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - switch (req->opcode) { - case IORING_OP_NOP: - return io_nop_prep(req, sqe); - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - return io_prep_rw(req, sqe); - case IORING_OP_POLL_ADD: - return io_poll_add_prep(req, sqe); - case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); - case IORING_OP_FSYNC: - return io_fsync_prep(req, sqe); - case IORING_OP_SYNC_FILE_RANGE: - return io_sfr_prep(req, sqe); - case IORING_OP_SENDMSG: - case IORING_OP_SEND: - return io_sendmsg_prep(req, sqe); - case IORING_OP_RECVMSG: - case IORING_OP_RECV: - return io_recvmsg_prep(req, sqe); - case IORING_OP_CONNECT: - return io_connect_prep(req, sqe); - case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe); - case IORING_OP_TIMEOUT_REMOVE: - return io_timeout_remove_prep(req, sqe); - case IORING_OP_ASYNC_CANCEL: - return io_async_cancel_prep(req, sqe); - case IORING_OP_LINK_TIMEOUT: - return io_link_timeout_prep(req, sqe); - case IORING_OP_ACCEPT: - return io_accept_prep(req, sqe); - case IORING_OP_FALLOCATE: - return io_fallocate_prep(req, sqe); - case IORING_OP_OPENAT: - return io_openat_prep(req, sqe); - case IORING_OP_CLOSE: - return io_close_prep(req, sqe); - case IORING_OP_FILES_UPDATE: - return io_files_update_prep(req, sqe); - case IORING_OP_STATX: - return io_statx_prep(req, sqe); - case IORING_OP_FADVISE: - return io_fadvise_prep(req, sqe); - case IORING_OP_MADVISE: - return io_madvise_prep(req, sqe); - case IORING_OP_OPENAT2: - return io_openat2_prep(req, sqe); - case IORING_OP_EPOLL_CTL: - return io_epoll_ctl_prep(req, sqe); - case IORING_OP_SPLICE: - return io_splice_prep(req, sqe); - case IORING_OP_PROVIDE_BUFFERS: - return io_provide_buffers_prep(req, sqe); - case IORING_OP_REMOVE_BUFFERS: - return io_remove_buffers_prep(req, sqe); - case IORING_OP_TEE: - return io_tee_prep(req, sqe); - case IORING_OP_SHUTDOWN: - return io_shutdown_prep(req, sqe); - case IORING_OP_RENAMEAT: - return io_renameat_prep(req, sqe); - case IORING_OP_UNLINKAT: - return io_unlinkat_prep(req, sqe); - case IORING_OP_MKDIRAT: - return io_mkdirat_prep(req, sqe); - case IORING_OP_SYMLINKAT: - return io_symlinkat_prep(req, sqe); - case IORING_OP_LINKAT: - return io_linkat_prep(req, sqe); - case IORING_OP_MSG_RING: - return io_msg_ring_prep(req, sqe); - case IORING_OP_FSETXATTR: - return io_fsetxattr_prep(req, sqe); - case IORING_OP_SETXATTR: - return io_setxattr_prep(req, sqe); - case IORING_OP_FGETXATTR: - return io_fgetxattr_prep(req, sqe); - case IORING_OP_GETXATTR: - return io_getxattr_prep(req, sqe); - case IORING_OP_SOCKET: - return io_socket_prep(req, sqe); - case IORING_OP_URING_CMD: - 
return io_uring_cmd_prep(req, sqe); - } - - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - return -EINVAL; -} - -static int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -EAGAIN; - - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", - req->opcode); - return -EFAULT; -} - -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - -static __cold void io_drain_req(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - - ret = io_req_prep_async(req); - if (ret) { -fail: - io_req_complete_failed(req, ret); - return; - } - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); - if (!de) { - ret = -ENOMEM; - goto fail; - } - - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - - trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); - de->req = req; - de->seq = seq; - list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); -} - -static void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - - if (req->flags & REQ_F_NEED_CLEANUP) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); - break; - } - case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: { - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); - break; - } - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - if (req->open.filename) - putname(req->open.filename); - break; - case IORING_OP_RENAMEAT: - putname(req->rename.oldpath); - putname(req->rename.newpath); - break; - case IORING_OP_UNLINKAT: - putname(req->unlink.filename); - break; - case IORING_OP_MKDIRAT: - putname(req->mkdir.filename); - break; - case IORING_OP_SYMLINKAT: - putname(req->symlink.oldpath); - putname(req->symlink.newpath); - break; - case 
IORING_OP_LINKAT: - putname(req->hardlink.oldpath); - putname(req->hardlink.newpath); - break; - case IORING_OP_STATX: - if (req->statx.filename) - putname(req->statx.filename); - break; - case IORING_OP_SETXATTR: - case IORING_OP_FSETXATTR: - case IORING_OP_GETXATTR: - case IORING_OP_FGETXATTR: - __io_xattr_finish(req); - break; - } - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; -} - -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) -{ - if (req->file || !io_op_defs[req->opcode].needs_file) - return true; - - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); - else - req->file = io_file_get_normal(req, req->cqe.fd); - - return !!req->file; -} - -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - const struct cred *creds = NULL; - int ret; - - if (unlikely(!io_assign_file(req, issue_flags))) - return -EBADF; - - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); - - if (!def->audit_skip) - audit_uring_entry(req->opcode); - - switch (req->opcode) { - case IORING_OP_NOP: - ret = io_nop(req, issue_flags); - break; - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - ret = io_read(req, issue_flags); - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - ret = io_write(req, issue_flags); - break; - case IORING_OP_FSYNC: - ret = io_fsync(req, issue_flags); - break; - case IORING_OP_POLL_ADD: - ret = io_poll_add(req, issue_flags); - break; - case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); - break; - case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, issue_flags); - break; - case IORING_OP_SENDMSG: - ret = io_sendmsg(req, issue_flags); - break; - case IORING_OP_SEND: - ret = io_send(req, issue_flags); - break; - case IORING_OP_RECVMSG: - ret = io_recvmsg(req, issue_flags); - break; - case IORING_OP_RECV: - ret = io_recv(req, issue_flags); - break; - case IORING_OP_TIMEOUT: - ret = io_timeout(req, issue_flags); - break; - case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, issue_flags); - break; - case IORING_OP_ACCEPT: - ret = io_accept(req, issue_flags); - break; - case IORING_OP_CONNECT: - ret = io_connect(req, issue_flags); - break; - case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, issue_flags); - break; - case IORING_OP_FALLOCATE: - ret = io_fallocate(req, issue_flags); - break; - case IORING_OP_OPENAT: - ret = io_openat(req, issue_flags); - break; - case IORING_OP_CLOSE: - ret = io_close(req, issue_flags); - break; - case IORING_OP_FILES_UPDATE: - ret = io_files_update(req, issue_flags); - break; - case IORING_OP_STATX: - ret = io_statx(req, issue_flags); - break; - case IORING_OP_FADVISE: - ret = io_fadvise(req, issue_flags); - break; - case IORING_OP_MADVISE: - ret = io_madvise(req, issue_flags); - break; - case IORING_OP_OPENAT2: - ret = io_openat2(req, issue_flags); - break; - case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl(req, issue_flags); - break; - case 
IORING_OP_SPLICE: - ret = io_splice(req, issue_flags); - break; - case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers(req, issue_flags); - break; - case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers(req, issue_flags); - break; - case IORING_OP_TEE: - ret = io_tee(req, issue_flags); - break; - case IORING_OP_SHUTDOWN: - ret = io_shutdown(req, issue_flags); - break; - case IORING_OP_RENAMEAT: - ret = io_renameat(req, issue_flags); - break; - case IORING_OP_UNLINKAT: - ret = io_unlinkat(req, issue_flags); - break; - case IORING_OP_MKDIRAT: - ret = io_mkdirat(req, issue_flags); - break; - case IORING_OP_SYMLINKAT: - ret = io_symlinkat(req, issue_flags); - break; - case IORING_OP_LINKAT: - ret = io_linkat(req, issue_flags); - break; - case IORING_OP_MSG_RING: - ret = io_msg_ring(req, issue_flags); - break; - case IORING_OP_FSETXATTR: - ret = io_fsetxattr(req, issue_flags); - break; - case IORING_OP_SETXATTR: - ret = io_setxattr(req, issue_flags); - break; - case IORING_OP_FGETXATTR: - ret = io_fgetxattr(req, issue_flags); - break; - case IORING_OP_GETXATTR: - ret = io_getxattr(req, issue_flags); - break; - case IORING_OP_SOCKET: - ret = io_socket(req, issue_flags); - break; - case IORING_OP_URING_CMD: - ret = io_uring_cmd(req, issue_flags); - break; - default: - ret = -EINVAL; - break; - } - - if (!def->audit_skip) - audit_uring_exit(!ret, ret); - - if (creds) - revert_creds(creds); - if (ret) - return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) - io_iopoll_req_issued(req, issue_flags); - - return 0; -} - -static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - req = io_put_req_find_next(req); - return req ? &req->work : NULL; -} - -static void io_wq_submit_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; - unsigned int issue_flags = IO_URING_F_UNLOCKED; - bool needs_poll = false; - int ret = 0, err = -ECANCELED; - - /* one will be dropped by ->io_free_work() after returning to io-wq */ - if (!(req->flags & REQ_F_REFCOUNT)) - __io_req_set_refcount(req, 2); - else - req_ref_get(req); - - io_arm_ltimeout(req); - - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { -fail: - io_req_task_queue_fail(req, err); - return; - } - if (!io_assign_file(req, issue_flags)) { - err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; - goto fail; - } - - if (req->flags & REQ_F_FORCE_ASYNC) { - bool opcode_poll = def->pollin || def->pollout; - - if (opcode_poll && file_can_poll(req->file)) { - needs_poll = true; - issue_flags |= IO_URING_F_NONBLOCK; - } - } - - do { - ret = io_issue_sqe(req, issue_flags); - if (ret != -EAGAIN) - break; - /* - * We can get EAGAIN for iopolled IO even though we're - * forcing a sync submission from here, since we can't - * wait for request slots on the block side. 
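
Userspace forces the io-wq path above with IOSQE_ASYNC, whose bit value is shared with REQ_F_FORCE_ASYNC (see the "same numerical values" note in io_init_req() further down). A sketch:

	char buf[4096];
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	/* skip the inline nonblocking attempt and hand straight to a
	 * worker, which may still prefer arming poll over blocking,
	 * per the retry loop above */
	sqe->flags |= IOSQE_ASYNC;
	io_uring_submit(&ring);
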
- */ - if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) - break; - cond_resched(); - continue; - } - - if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) - return; - /* aborted or ready, in either case retry blocking */ - needs_poll = false; - issue_flags &= ~IO_URING_F_NONBLOCK; - } while (1); - - /* avoid locking problems by failing it from a clean context */ - if (ret) - io_req_task_queue_fail(req, ret); -} - -static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) -{ - return &table->files[i]; -} - -static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, - int index) -{ - struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); -} - -static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) -{ - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; -} - -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct file *file = NULL; - unsigned long file_ptr; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - goto out; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); - io_req_set_rsrc_node(req, ctx, 0); - WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); -out: - io_ring_submit_unlock(ctx, issue_flags); - return file; -} - -static struct file *io_file_get_normal(struct io_kiocb *req, int fd) -{ - struct file *file = fget(fd); - - trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); - - /* we don't allow fixed io_uring files */ - if (file && file->f_op == &io_uring_fops) - io_req_track_inflight(req); - return file; -} - -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_kiocb *prev = req->timeout.prev; - int ret = -ENOENT; - - if (prev) { - if (!(req->task->flags & PF_EXITING)) { - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = prev->cqe.user_data, - }; - - ret = io_try_cancel(req, &cd); - } - io_req_complete_post(req, ret ?: -ETIME, 0); - io_put_req(prev); - } else { - io_req_complete_post(req, -ETIME, 0); - } -} - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *prev, *req = data->req; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = req->timeout.head; - req->timeout.head = NULL; - - /* - * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. 
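
A linked timeout like the one completed above is armed by placing IORING_OP_LINK_TIMEOUT immediately behind the operation it guards, joined with IOSQE_IO_LINK. A liburing sketch (ring, fd, and buf assumed):

	struct __kernel_timespec ts = { .tv_nsec = 500 * 1000 * 1000 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	sqe->flags |= IOSQE_IO_LINK;		/* next SQE times this one */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);
	io_uring_submit(&ring);

If the read is too slow it normally completes with -ECANCELED and the timeout CQE reports -ETIME; if the read finishes first, the timeout CQE reports -ECANCELED instead.
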
- */ - if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) - prev = NULL; - } - list_del(&req->timeout.list); - req->timeout.prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - /* - * If the back reference is NULL, then our linked request finished - * before we got a chance to setup the timer - */ - if (req->timeout.head) { - struct io_timeout_data *data = req->async_data; - - data->timer.function = io_link_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); - list_add_tail(&req->timeout.list, &ctx->ltimeout_list); - } - spin_unlock_irq(&ctx->timeout_lock); - /* drop submission reference */ - io_put_req(req); -} - -static void io_queue_async(struct io_kiocb *req, int ret) - __must_hold(&req->ctx->uring_lock) -{ - struct io_kiocb *linked_timeout; - - if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { - io_req_complete_failed(req, ret); - return; - } - - linked_timeout = io_prep_linked_timeout(req); - - switch (io_arm_poll_handler(req, 0)) { - case IO_APOLL_READY: - io_req_task_queue(req); - break; - case IO_APOLL_ABORTED: - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_iowq(req, NULL); - break; - case IO_APOLL_OK: - break; - } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); -} - -static inline void io_queue_sqe(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - int ret; - - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - - if (req->flags & REQ_F_COMPLETE_INLINE) { - io_req_add_compl_list(req); - return; - } - /* - * We async punt it if the file wasn't marked NOWAIT, or if the file - * doesn't support non-blocking read/write attempts - */ - if (likely(!ret)) - io_arm_ltimeout(req); - else - io_queue_async(req, ret); -} - -static void io_queue_sqe_fallback(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - if (unlikely(req->flags & REQ_F_FAIL)) { - /* - * We don't submit, fail them all, for that replace hardlinks - * with normal links. Extra REQ_F_LINK is tolerated. - */ - req->flags &= ~REQ_F_HARDLINK; - req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->cqe.res); - } else if (unlikely(req->ctx->drain_active)) { - io_drain_req(req); - } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) - io_req_complete_failed(req, ret); - else - io_queue_iowq(req, NULL); - } -} - -/* - * Check SQE restrictions (opcode and flags). - * - * Returns 'true' if SQE is allowed, 'false' otherwise. 
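
The restriction check documented above is the enforcement half of IORING_REGISTER_RESTRICTIONS; registration happens while the ring is still disabled. A liburing sketch (error handling elided):

	struct io_uring ring;
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ  },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITE },
	};

	io_uring_queue_init(8, &ring, IORING_SETUP_R_DISABLED);
	io_uring_register_restrictions(&ring, res, 2);
	io_uring_enable_rings(&ring);
	/* any opcode outside the registered set now fails with -EACCES */
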
- */ -static inline bool io_check_restriction(struct io_ring_ctx *ctx, - struct io_kiocb *req, - unsigned int sqe_flags) -{ - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) - return false; - - if ((sqe_flags & ctx->restrictions.sqe_flags_required) != - ctx->restrictions.sqe_flags_required) - return false; - - if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | - ctx->restrictions.sqe_flags_required)) - return false; - - return true; -} - -static void io_init_req_drain(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *head = ctx->submit_state.link.head; - - ctx->drain_active = true; - if (head) { - /* - * If we need to drain a request in the middle of a link, drain - * the head request and the next request/link after the current - * link. Considering sequential execution of links, - * REQ_F_IO_DRAIN will be maintained for every request of our - * link. - */ - head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; - } -} - -static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) - __must_hold(&ctx->uring_lock) -{ - const struct io_op_def *def; - unsigned int sqe_flags; - int personality; - u8 opcode; - - /* req is partially pre-initialised, see io_preinit_req() */ - req->opcode = opcode = READ_ONCE(sqe->opcode); - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->cqe.user_data = READ_ONCE(sqe->user_data); - req->file = NULL; - req->rsrc_node = NULL; - req->task = current; - - if (unlikely(opcode >= IORING_OP_LAST)) { - req->opcode = 0; - return -EINVAL; - } - def = &io_op_defs[opcode]; - if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { - /* enforce forwards compatibility on users */ - if (sqe_flags & ~SQE_VALID_FLAGS) - return -EINVAL; - if (sqe_flags & IOSQE_BUFFER_SELECT) { - if (!def->buffer_select) - return -EOPNOTSUPP; - req->buf_index = READ_ONCE(sqe->buf_group); - } - if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; - if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) - return -EOPNOTSUPP; - io_init_req_drain(req); - } - } - if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { - if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; - /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) - req->flags |= REQ_F_FORCE_ASYNC; - /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; - req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - } - } - - if (!def->ioprio && sqe->ioprio) - return -EINVAL; - if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - - if (def->needs_file) { - struct io_submit_state *state = &ctx->submit_state; - - req->cqe.fd = READ_ONCE(sqe->fd); - - /* - * Plug now if we have more than 2 IO left after this, and the - * target is potentially a read/write to block based storage. 
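
IOSQE_IO_DRAIN, handled by io_init_req_drain() above, is a full barrier: the flagged SQE waits for every previously submitted request, via the defer-list machinery in io_drain_req() further up. A sketch:

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_fsync(sqe, fd, 0);
	sqe->flags |= IOSQE_IO_DRAIN;	/* fsync starts only once all
					 * prior requests have completed */
	io_uring_submit(&ring);
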
- */ - if (state->need_plug && def->plug) { - state->plug_started = true; - state->need_plug = false; - blk_start_plug_nr_ios(&state->plug, state->submit_nr); - } - } - - personality = READ_ONCE(sqe->personality); - if (personality) { - int ret; - - req->creds = xa_load(&ctx->personalities, personality); - if (!req->creds) - return -EINVAL; - get_cred(req->creds); - ret = security_uring_override_creds(req->creds); - if (ret) { - put_cred(req->creds); - return ret; - } - req->flags |= REQ_F_CREDS; - } - - return io_req_prep(req, sqe); -} - -static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, - struct io_kiocb *req, int ret) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_link *link = &ctx->submit_state.link; - struct io_kiocb *head = link->head; - - trace_io_uring_req_failed(sqe, ctx, req, ret); - - /* - * Avoid breaking links in the middle as it renders links with SQPOLL - * unusable. Instead of failing eagerly, continue assembling the link if - * applicable and mark the head with REQ_F_FAIL. The link flushing code - * should find the flag and handle the rest. - */ - req_fail_link_node(req, ret); - if (head && !(head->flags & REQ_F_FAIL)) - req_fail_link_node(head, -ECANCELED); - - if (!(req->flags & IO_REQ_LINK_FLAGS)) { - if (head) { - link->last->link = req; - link->head = NULL; - req = head; - } - io_queue_sqe_fallback(req); - return ret; - } - - if (head) - link->last->link = req; - else - link->head = req; - link->last = req; - return 0; -} - -static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) - __must_hold(&ctx->uring_lock) -{ - struct io_submit_link *link = &ctx->submit_state.link; - int ret; - - ret = io_init_req(ctx, req, sqe); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, - req->flags, true, - ctx->flags & IORING_SETUP_SQPOLL); - - /* - * If we already have a head request, queue this one for async - * submittal once the head completes. If we don't have a head but - * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be - * submitted sync once the chain is complete. If none of those - * conditions are true (normal request), then just queue it. - */ - if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - - trace_io_uring_link(ctx, req, link->head); - link->last->link = req; - link->last = req; - - if (req->flags & IO_REQ_LINK_FLAGS) - return 0; - /* last request of the link, flush it */ - req = link->head; - link->head = NULL; - if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) - goto fallback; - - } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | - REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { - if (req->flags & IO_REQ_LINK_FLAGS) { - link->head = req; - link->last = req; - } else { -fallback: - io_queue_sqe_fallback(req); - } - return 0; - } - - io_queue_sqe(req); - return 0; -} - -/* - * Batched submission is done, ensure local IO is flushed out. - */ -static void io_submit_state_end(struct io_ring_ctx *ctx) -{ - struct io_submit_state *state = &ctx->submit_state; - - if (unlikely(state->link.head)) - io_queue_sqe_fallback(state->link.head); - /* flush only after queuing links as they can generate completions */ - io_submit_flush_completions(ctx); - if (state->plug_started) - blk_finish_plug(&state->plug); -} - -/* - * Start submission side cache. 
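
The link head/last bookkeeping above assembles chains; unlike IOSQE_IO_DRAIN, the ordering applies only within the chain. A classic write-then-fsync sketch (ring, fd, buf, and len assumed; a failed member completes its successors with -ECANCELED):

	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_write(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK;		/* fsync waits for this write */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_fsync(sqe, fd, 0);	/* no link flag: chain ends here */
	io_uring_submit(&ring);
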
- */ -static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) -{ - state->plug_started = false; - state->need_plug = max_ios > 2; - state->submit_nr = max_ios; - /* set only head, no need to init link_last in advance */ - state->link.head = NULL; -} - -static void io_commit_sqring(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); -} - -/* - * Fetch an sqe, if one is available. Note this returns a pointer to memory - * that is mapped by userspace. This means that care needs to be taken to - * ensure that reads are stable, as we cannot rely on userspace always - * being a good citizen. If members of the sqe are validated and then later - * used, it's important that those reads are done through READ_ONCE() to - * prevent a re-load down the line. - */ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) -{ - unsigned head, mask = ctx->sq_entries - 1; - unsigned sq_idx = ctx->cached_sq_head++ & mask; - - /* - * The cached sq head (or cq tail) serves two purposes: - * - * 1) allows us to batch the cost of updating the user visible - * head updates. - * 2) allows the kernel side to track the head on its own, even - * though the application is the one updating it. - */ - head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) { - /* double index for 128-byte SQEs, twice as long */ - if (ctx->flags & IORING_SETUP_SQE128) - head <<= 1; - return &ctx->sq_sqes[head]; - } - - /* drop invalid entries */ - ctx->cq_extra--; - WRITE_ONCE(ctx->rings->sq_dropped, - READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; -} - -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) - __must_hold(&ctx->uring_lock) -{ - unsigned int entries = io_sqring_entries(ctx); - unsigned int left; - int ret; - - if (unlikely(!entries)) - return 0; - /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); - io_get_task_refs(left); - io_submit_state_start(&ctx->submit_state, left); - - do { - const struct io_uring_sqe *sqe; - struct io_kiocb *req; - - if (unlikely(!io_alloc_req_refill(ctx))) - break; - req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { - io_req_add_to_cache(req, ctx); - break; - } - - /* - * Continue submitting even for sqe failure if the - * ring was setup with IORING_SETUP_SUBMIT_ALL - */ - if (unlikely(io_submit_sqe(ctx, req, sqe)) && - !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { - left--; - break; - } - } while (--left); - - if (unlikely(left)) { - ret -= left; - /* try again if it submitted nothing and can't allocate a req */ - if (!ret && io_req_cache_empty(ctx)) - ret = -EAGAIN; - current->io_uring->cached_refs += left; - } - - io_submit_state_end(ctx); - /* Commit SQ ring head once we've consumed and submitted all SQEs */ - io_commit_sqring(ctx); - return ret; -} - -static inline bool io_sqd_events_pending(struct io_sq_data *sqd) -{ - return READ_ONCE(sqd->state); -} - -static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) -{ - unsigned int to_submit; - int ret = 0; - - to_submit = io_sqring_entries(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) - to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - - if 
(!wq_list_empty(&ctx->iopoll_list) || to_submit) { - const struct cred *creds = NULL; - - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - - mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, true); - - /* - * Don't submit if refs are dying, good for io_uring_register(), - * but also it is relied upon by io_ring_exit_work() - */ - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && - !(ctx->flags & IORING_SETUP_R_DISABLED)) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); - if (creds) - revert_creds(creds); - } - - return ret; -} - -static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - unsigned sq_thread_idle = 0; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); - sqd->sq_thread_idle = sq_thread_idle; -} - -static bool io_sqd_handle_event(struct io_sq_data *sqd) -{ - bool did_sig = false; - struct ksignal ksig; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - mutex_unlock(&sqd->lock); - if (signal_pending(current)) - did_sig = get_signal(&ksig); - cond_resched(); - mutex_lock(&sqd->lock); - } - return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static int io_sq_thread(void *data) -{ - struct io_sq_data *sqd = data; - struct io_ring_ctx *ctx; - unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; - DEFINE_WAIT(wait); - - snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); - set_task_comm(current, buf); - - if (sqd->sq_cpu != -1) - set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else - set_cpus_allowed_ptr(current, cpu_online_mask); - current->flags |= PF_NO_SETAFFINITY; - - audit_alloc_kernel(current); - - mutex_lock(&sqd->lock); - while (1) { - bool cap_entries, sqt_spin = false; - - if (io_sqd_events_pending(sqd) || signal_pending(current)) { - if (io_sqd_handle_event(sqd)) - break; - timeout = jiffies + sqd->sq_thread_idle; - } - - cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - int ret = __io_sq_thread(ctx, cap_entries); - - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) - sqt_spin = true; - } - if (io_run_task_work()) - sqt_spin = true; - - if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); - if (sqt_spin) - timeout = jiffies + sqd->sq_thread_idle; - continue; - } - - prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { - bool needs_sched = true; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - atomic_or(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - - /* - * Ensure the store of the wakeup flag is not - * reordered with the load of the SQ tail - */ - smp_mb__after_atomic(); - - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched) { - mutex_unlock(&sqd->lock); - schedule(); - mutex_lock(&sqd->lock); - } - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_andnot(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - } - - finish_wait(&sqd->wait, &wait); - timeout = jiffies + sqd->sq_thread_idle; - } - - io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; - 
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); - io_run_task_work(); - mutex_unlock(&sqd->lock); - - audit_free(current); - - complete(&sqd->exited); - do_exit(0); -} - -struct io_wait_queue { - struct wait_queue_entry wq; - struct io_ring_ctx *ctx; - unsigned cq_tail; - unsigned nr_timeouts; -}; - -static inline bool io_should_wake(struct io_wait_queue *iowq) -{ - struct io_ring_ctx *ctx = iowq->ctx; - int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; - - /* - * Wake up if we have enough events, or if a timeout occurred since we - * started waiting. For timeouts, we always want to return to userspace, - * regardless of event count. - */ - return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; -} - -static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, - int wake_flags, void *key) -{ - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - - /* - * Cannot safely flush overflowed CQEs from here, ensure we wake up - * the task, and the next invocation will do it. - */ - if (io_should_wake(iowq) || - test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) - return autoremove_wake_function(curr, mode, wake_flags, key); - return -1; -} - -static int io_run_task_work_sig(void) -{ - if (io_run_task_work()) - return 1; - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - return -ERESTARTSYS; - if (task_sigpending(current)) - return -EINTR; - return 0; -} - -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t timeout) -{ - int ret; - unsigned long check_cq; - - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(); - if (ret || io_should_wake(iowq)) - return ret; - check_cq = READ_ONCE(ctx->check_cq); - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) - return -EBADR; - if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) - return -ETIME; - return 1; -} - -/* - * Wait until events become available, if we don't already have some. The - * application must reap them itself, as they reside on the shared cq ring. 
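The comment above is the contract io_cqring_wait() implements: the kernel only wakes the waiter, and the application consumes CQEs directly from the mmap'd CQ ring. A minimal sketch of that reaping loop, with assumed names for the mapped ring pointers (khead, ktail, ring_mask and cqes are illustrative, not identifiers in this diff):

	#include <linux/io_uring.h>

	/* Acquire on the tail pairs with the kernel's release when it commits
	 * new CQEs; release on the head lets the kernel reuse the slots. */
	static unsigned int reap_cqes(unsigned int *khead, unsigned int *ktail,
				      unsigned int ring_mask,
				      struct io_uring_cqe *cqes)
	{
		unsigned int head = *khead;
		unsigned int tail = __atomic_load_n(ktail, __ATOMIC_ACQUIRE);
		unsigned int reaped = 0;

		while (head != tail) {
			struct io_uring_cqe *cqe = &cqes[head & ring_mask];

			/* consume cqe->user_data / cqe->res here */
			head++;
			reaped++;
		}
		__atomic_store_n(khead, head, __ATOMIC_RELEASE);
		return reaped;
	}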
- */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) -{ - struct io_wait_queue iowq; - struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; - int ret; - - do { - io_cqring_overflow_flush(ctx); - if (io_cqring_events(ctx) >= min_events) - return 0; - if (!io_run_task_work()) - break; - } while (1); - - if (sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); - else -#endif - ret = set_user_sigmask(sig, sigsz); - - if (ret) - return ret; - } - - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - - init_waitqueue_func_entry(&iowq.wq, io_wake_function); - iowq.wq.private = current; - INIT_LIST_HEAD(&iowq.wq.entry); - iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; - - trace_io_uring_cqring_wait(ctx, min_events); - do { - /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx)) { - ret = -EBUSY; - break; - } - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, timeout); - cond_resched(); - } while (ret > 0); - - finish_wait(&ctx->cq_wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -EINTR); - - return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; -} - -static void io_free_page_table(void **table, size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} - -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; - - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; - - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; - } - return table; -} - -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); -} - -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); -} - -static struct io_rsrc_node *io_rsrc_node_alloc(void) -{ - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) 
{ - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - -static void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) -{ - WARN_ON_ONCE(!ctx->rsrc_backup_node); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - - io_rsrc_refs_drop(ctx); - - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); - - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } - - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } -} - -static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; -} - -static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) -{ - int ret; - - /* As we may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - data->quiesce = true; - do { - ret = io_rsrc_node_switch_start(ctx); - if (ret) - break; - io_rsrc_node_switch(ctx, data); - - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) - break; - mutex_unlock(&ctx->uring_lock); - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) > 0) { - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } else { - break; - } - } - - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); - - ret = io_run_task_work_sig(); - mutex_lock(&ctx->uring_lock); - } while (ret >= 0); - data->quiesce = false; - - return ret; -} - -static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) -{ - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; - unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; - - return &data->tags[table_idx][off]; -} - -static void io_rsrc_data_free(struct io_rsrc_data *data) -{ - size_t size = data->nr * sizeof(data->tags[0][0]); - - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); -} - -static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = -ENOMEM; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->do_put = do_put; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - - atomic_set(&data->refs, 1); - init_completion(&data->done); - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; -} - -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - table->files = 
kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) - return false; - - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; -} - -static void io_free_file_tables(struct io_file_table *table) -{ - kvfree(table->files); - bitmap_free(table->bitmap); - table->files = NULL; - table->bitmap = NULL; -} - -static inline void io_file_bitmap_set(struct io_file_table *table, int bit) -{ - WARN_ON_ONCE(test_bit(bit, table->bitmap)); - __set_bit(bit, table->bitmap); - table->alloc_hint = bit + 1; -} - -static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) -{ - __clear_bit(bit, table->bitmap); - table->alloc_hint = bit; -} - -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ -#if !defined(IO_URING_SCM_ALL) - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(ctx, i); - - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); - } -#endif - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif - io_free_file_tables(&ctx->file_table); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; -} - -static int io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - int ret; - - if (!ctx->file_data) - return -ENXIO; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; -} - -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - /* - * Do the dance but not conditional clear_bit() because it'd race with - * other threads incrementing park_pending and setting the bit. 
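io_file_bitmap_set() and io_file_bitmap_clear() above keep ->alloc_hint pointing at the most promising slot, so allocating a fixed-file slot rarely rescans the whole bitmap. The lookup side is not in this hunk; a hypothetical helper following the same hint-then-wrap pattern might look like:

	/* Sketch only: find a free slot starting at the hint, wrapping once. */
	static int bitmap_get_slot(unsigned long *bitmap, unsigned int nr_slots,
				   unsigned int *alloc_hint)
	{
		unsigned int bit;

		bit = find_next_zero_bit(bitmap, nr_slots, *alloc_hint);
		if (bit >= nr_slots)	/* nothing past the hint, wrap around */
			bit = find_next_zero_bit(bitmap, nr_slots, 0);
		if (bit >= nr_slots)
			return -ENFILE;
		__set_bit(bit, bitmap);
		*alloc_hint = bit + 1;
		return bit;
	}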
- */ - clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (atomic_dec_return(&sqd->park_pending)) - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - atomic_inc(&sqd->park_pending); - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); -} - -static void io_sq_thread_stop(struct io_sq_data *sqd) -{ - WARN_ON_ONCE(sqd->thread == current); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); - - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); -} - -static void io_put_sq_data(struct io_sq_data *sqd) -{ - if (refcount_dec_and_test(&sqd->refs)) { - WARN_ON_ONCE(atomic_read(&sqd->park_pending)); - - io_sq_thread_stop(sqd); - kfree(sqd); - } -} - -static void io_sq_thread_finish(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) -{ - struct io_ring_ctx *ctx_attach; - struct io_sq_data *sqd; - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return ERR_PTR(-ENXIO); - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - ctx_attach = f.file->private_data; - sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); - return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); - return ERR_PTR(-EPERM); - } - - refcount_inc(&sqd->refs); - fdput(f); - return sqd; -} - -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, - bool *attached) -{ - struct io_sq_data *sqd; - - *attached = false; - if (p->flags & IORING_SETUP_ATTACH_WQ) { - sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) { - *attached = true; - return sqd; - } - /* fall through for EPERM case, setup new sqd/task */ - if (PTR_ERR(sqd) != -EPERM) - return sqd; - } - - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); - if (!sqd) - return ERR_PTR(-ENOMEM); - - atomic_set(&sqd->park_pending, 0); - refcount_set(&sqd->refs, 1); - INIT_LIST_HEAD(&sqd->ctx_list); - mutex_init(&sqd->lock); - init_waitqueue_head(&sqd->wait); - init_completion(&sqd->exited); - return sqd; -} - -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. - */ -static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. 
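io_attach_sq_data() above is the kernel side of IORING_SETUP_ATTACH_WQ: a second ring may share an existing ring's SQPOLL data rather than spawning its own thread, provided the wq_fd really is an io_uring fd from the same thread group. A hedged sketch of the userspace half (raw syscall form; first_ring_fd and qd are assumptions):

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int setup_attached_ring(int first_ring_fd, unsigned int qd)
	{
		struct io_uring_params p = {
			.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ,
			.wq_fd = first_ring_fd,
		};

		/* Per io_get_sq_data() above, an -EPERM attach (different tgid)
		 * quietly falls back to creating a private SQPOLL task. */
		return syscall(__NR_io_uring_setup, qd, &p);
	}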
- */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - if (!io_file_need_scm(file)) { - fput(file); - return; - } - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(file); -#endif -} - -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; - - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -static void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args, u64 __user *tags) -{ - __s32 __user *fds = (__s32 __user *) arg; - struct file *file; - int fd, ret; - unsigned i; - - if (ctx->file_data) - return 
-EBUSY; - if (!nr_args) - return -EINVAL; - if (nr_args > IORING_MAX_FIXED_FILES) - return -EMFILE; - if (nr_args > rlimit(RLIMIT_NOFILE)) - return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; - - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; - goto fail; - } - /* allow sparse sets */ - if (!fds || fd == -1) { - ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto fail; - continue; - } - - file = fget(fd); - ret = -EBADF; - if (unlikely(!file)) - goto fail; - - /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. - */ - if (file->f_op == &io_uring_fops) { - fput(file); - goto fail; - } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - - io_rsrc_node_switch(ctx, NULL); - return 0; -fail: - __io_sqe_files_unregister(ctx); - return ret; -} - -static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) -{ - u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - - prsrc->tag = *tag_slot; - *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); - return 0; -} - -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (file->f_op == &io_uring_fops) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - ret = 
-ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: - io_ring_submit_unlock(ctx, issue_flags); - return ret; -} - -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) -{ - return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); -} - -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; - struct file *file; - int fd, i, err = 0; - unsigned int done; - bool needs_switch = false; - - if (!ctx->file_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || - copy_from_user(&fd, &fds[done], sizeof(fd))) { - err = -EFAULT; - break; - } - if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { - err = -EINVAL; - break; - } - if (fd == IORING_REGISTER_FILES_SKIP) - continue; - - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); - if (err) - break; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; - } - if (fd != -1) { - file = fget(fd); - if (!file) { - err = -EBADF; - break; - } - /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. - */ - if (file->f_op == &io_uring_fops) { - fput(file); - err = -EBADF; - break; - } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); - return done ? 
done : err; -} - -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, - struct task_struct *task) -{ - struct io_wq_hash *hash; - struct io_wq_data data; - unsigned int concurrency; - - mutex_lock(&ctx->uring_lock); - hash = ctx->hash_map; - if (!hash) { - hash = kzalloc(sizeof(*hash), GFP_KERNEL); - if (!hash) { - mutex_unlock(&ctx->uring_lock); - return ERR_PTR(-ENOMEM); - } - refcount_set(&hash->refs, 1); - init_waitqueue_head(&hash->wait); - ctx->hash_map = hash; - } - mutex_unlock(&ctx->uring_lock); - - data.hash = hash; - data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - - return io_wq_create(concurrency, &data); -} - -static __cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx; - int ret; - - tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); - if (unlikely(!tctx)) - return -ENOMEM; - - tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, - sizeof(struct file *), GFP_KERNEL); - if (unlikely(!tctx->registered_rings)) { - kfree(tctx); - return -ENOMEM; - } - - ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); - if (unlikely(ret)) { - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - tctx->io_wq = io_init_wq_offload(ctx, task); - if (IS_ERR(tctx->io_wq)) { - ret = PTR_ERR(tctx->io_wq); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - xa_init(&tctx->xa); - init_waitqueue_head(&tctx->wait); - atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - init_task_work(&tctx->task_work, tctx_task_work); - return 0; -} - -void __io_uring_free(struct task_struct *tsk) -{ - struct io_uring_task *tctx = tsk->io_uring; - - WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(tctx->io_wq); - WARN_ON_ONCE(tctx->cached_refs); - - kfree(tctx->registered_rings); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx); - tsk->io_uring = NULL; -} - -static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - int ret; - - /* Retain compatibility with failing for an invalid attach attempt */ - if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == - IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return -ENXIO; - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return -EINVAL; - } - fdput(f); - } - if (ctx->flags & IORING_SETUP_SQPOLL) { - struct task_struct *tsk; - struct io_sq_data *sqd; - bool attached; - - ret = security_uring_sqpoll(); - if (ret) - return ret; - - sqd = io_get_sq_data(p, &attached); - if (IS_ERR(sqd)) { - ret = PTR_ERR(sqd); - goto err; - } - - ctx->sq_creds = get_current_cred(); - ctx->sq_data = sqd; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; - - io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - /* don't attach to a dying SQPOLL thread, would be racy */ - ret = (attached && !sqd->thread) ? 
-ENXIO : 0; - io_sq_thread_unpark(sqd); - - if (ret < 0) - goto err; - if (attached) - return 0; - - if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu = p->sq_thread_cpu; - - ret = -EINVAL; - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) - goto err_sqpoll; - sqd->sq_cpu = cpu; - } else { - sqd->sq_cpu = -1; - } - - sqd->task_pid = current->pid; - sqd->task_tgid = current->tgid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto err_sqpoll; - } - - sqd->thread = tsk; - ret = io_uring_alloc_task_context(tsk, ctx); - wake_up_new_task(tsk); - if (ret) - goto err; - } else if (p->flags & IORING_SETUP_SQ_AFF) { - /* Can't have SQ_AFF without SQPOLL */ - ret = -EINVAL; - goto err; - } - - return 0; -err_sqpoll: - complete(&ctx->sq_data->exited); -err: - io_sq_thread_finish(ctx); - return ret; -} - -static inline void __io_unaccount_mem(struct user_struct *user, - unsigned long nr_pages) -{ - atomic_long_sub(nr_pages, &user->locked_vm); -} - -static inline int __io_account_mem(struct user_struct *user, - unsigned long nr_pages) -{ - unsigned long page_limit, cur_pages, new_pages; - - /* Don't allow more pages than we can safely lock */ - page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - do { - cur_pages = atomic_long_read(&user->locked_vm); - new_pages = cur_pages + nr_pages; - if (new_pages > page_limit) - return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - - return 0; -} - -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); - - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); -} - -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - int ret; - - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); - if (ret) - return ret; - } - - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); - - return 0; -} - -static void io_mem_free(void *ptr) -{ - struct page *page; - - if (!ptr) - return; - - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); -} - -static void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - - return (void *) __get_free_pages(gfp, get_order(size)); -} - -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; - - off = struct_size(rings, cqes, cq_entries); - if (off == SIZE_MAX) - return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } - -#ifdef CONFIG_SMP - off = ALIGN(off, SMP_CACHE_BYTES); - if (off == 0) - return SIZE_MAX; -#endif - - if (sq_offset) - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - - return off; -} - -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) -{ - struct io_mapped_ubuf *imu = *slot; - unsigned int i; - - if (imu != ctx->dummy_ubuf) { - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } - *slot = NULL; -} - -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - 
io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - int ret; - - if (!ctx->buf_data) - return -ENXIO; - - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; -} - -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - -/* - * Not super efficient, but this is just a registration time. And we do cache - * the last compound head, so generally we'll only do a full search if we don't - * match that one. - * - * We check if the given compound head page has already been accounted, to - * avoid double accounting it. This allows us to account the full size of the - * page, not just the constituent pages of a huge page. - */ -static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct page *hpage) -{ - int i, j; - - /* check current page array */ - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) - continue; - if (compound_head(pages[i]) == hpage) - return true; - } - - /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; - - for (j = 0; j < imu->nr_bvecs; j++) { - if (!PageCompound(imu->bvec[j].bv_page)) - continue; - if (compound_head(imu->bvec[j].bv_page) == hpage) - return true; - } - } - - return false; -} - -static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct io_mapped_ubuf *imu, - struct page **last_hpage) -{ - int i, ret; - - imu->acct_pages = 0; - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) { - imu->acct_pages++; - } else { - struct page *hpage; - - hpage = compound_head(pages[i]); - if (hpage == *last_hpage) - continue; - *last_hpage = hpage; - if (headpage_already_acct(ctx, pages, i, hpage)) - continue; - imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; - } - } - - if (!imu->acct_pages) - return 0; - - ret = io_account_mem(ctx, imu->acct_pages); - if (ret) - imu->acct_pages = 0; - return ret; -} - -static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, - int *npages) -{ - unsigned long start, end, nr_pages; - struct vm_area_struct **vmas = NULL; - struct page **pages = NULL; - int i, pret, ret = -ENOMEM; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto done; - - vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - goto done; - - ret = 0; - mmap_read_lock(current->mm); - pret = 
pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret == nr_pages) { - /* don't support file backed memory */ - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma_is_shmem(vma)) - continue; - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } - *npages = nr_pages; - } else { - ret = pret < 0 ? pret : -EFAULT; - } - mmap_read_unlock(current->mm); - if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; - } - ret = 0; -done: - kvfree(vmas); - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); - } - return pages; -} - -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) -{ - struct io_mapped_ubuf *imu = NULL; - struct page **pages = NULL; - unsigned long off; - size_t size; - int ret, nr_pages, i; - - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - *pimu = NULL; - ret = -ENOMEM; - - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, - &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - pages = NULL; - goto done; - } - - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - - ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); - goto done; - } - - off = (unsigned long) iov->iov_base & ~PAGE_MASK; - size = iov->iov_len; - for (i = 0; i < nr_pages; i++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; - off = 0; - size -= vec_len; - } - /* store original address for later verification */ - imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; - imu->nr_bvecs = nr_pages; - *pimu = imu; - ret = 0; -done: - if (ret) - kvfree(imu); - kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 0 : -ENOMEM; -} - -static int io_buffer_validate(struct iovec *iov) -{ - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); - - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? 
-EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; -} - -static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args, u64 __user *tags) -{ - struct page *last_hpage = NULL; - struct io_rsrc_data *data; - int i, ret; - struct iovec iov; - - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) - return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); - if (ret) - return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; - } else { - memset(&iov, 0, sizeof(iov)); - } - - if (!iov.iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; - } - - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) - break; - } - - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); - return ret; -} - -static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned int nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); - struct page *last_hpage = NULL; - bool needs_switch = false; - __u32 done; - int i, err; - - if (!ctx->buf_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; - int offset = up->offset + done; - u64 tag = 0; - - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) - break; - if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { - err = -EFAULT; - break; - } - err = io_buffer_validate(&iov); - if (err) - break; - if (!iov.iov_base && tag) { - err = -EINVAL; - break; - } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); - break; - } - ctx->user_bufs[i] = NULL; - needs_switch = true; - } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; - } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); - return done ? 
done : err; -} - -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -static void io_eventfd_put(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - call_rcu(&ev_fd->rcu, io_eventfd_put); - return 0; - } - - return -ENXIO; -} - -static void io_destroy_buffers(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } - - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } - - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); - } -} - -static void io_req_caches_free(struct io_ring_ctx *ctx) -{ - struct io_submit_state *state = &ctx->submit_state; - int nr = 0; - - mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, state); - - while (!io_req_cache_empty(ctx)) { - struct io_wq_work_node *node; - struct io_kiocb *req; - - node = wq_stack_extract(&state->free_list); - req = container_of(node, struct io_kiocb, comp_list); - kmem_cache_free(req_cachep, req); - nr++; - } - if (nr) - percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); -} - -static void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - -static void io_flush_apoll_cache(struct io_ring_ctx *ctx) -{ - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } -} - -static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) -{ - io_sq_thread_finish(ctx); - - if (ctx->mm_account) { - mmdrop(ctx->mm_account); - ctx->mm_account = NULL; - } - - io_rsrc_refs_drop(ctx); - /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); - - mutex_lock(&ctx->uring_lock); - if (ctx->buf_data) - __io_sqe_buffers_unregister(ctx); - if (ctx->file_data) - __io_sqe_files_unregister(ctx); - if (ctx->rings) - __io_cqring_overflow_flush(ctx, true); - io_eventfd_unregister(ctx); - io_flush_apoll_cache(ctx); - mutex_unlock(&ctx->uring_lock); - io_destroy_buffers(ctx); - if (ctx->sq_creds) - put_cred(ctx->sq_creds); - - /* there are no 
registered resources left, nobody uses it */ - if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); - - WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - ctx->ring_sock->file = NULL; /* so that iput() is called */ - sock_release(ctx->ring_sock); - } -#endif - WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); - - percpu_ref_exit(&ctx->refs); - free_uid(ctx->user); - io_req_caches_free(ctx); - if (ctx->hash_map) - io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_hash); - kfree(ctx->dummy_ubuf); - kfree(ctx->io_bl); - xa_destroy(&ctx->io_bl_xa); - kfree(ctx); -} - -static __poll_t io_uring_poll(struct file *file, poll_table *wait) -{ - struct io_ring_ctx *ctx = file->private_data; - __poll_t mask = 0; - - poll_wait(file, &ctx->cq_wait, wait); - /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring - */ - smp_rmb(); - if (!io_sqring_full(ctx)) - mask |= EPOLLOUT | EPOLLWRNORM; - - /* - * Don't flush cqring overflow list here, just do a simple check. - * Otherwise there could possibly be ABBA deadlock: - * CPU0 CPU1 - * ---- ---- - * lock(&ctx->uring_lock); - * lock(&ep->mtx); - * lock(&ctx->uring_lock); - * lock(&ep->mtx); - * - * Users may get EPOLLIN meanwhile seeing nothing in cqring, this - * pushes them to do the flush. - */ - if (io_cqring_events(ctx) || - test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - mask |= EPOLLIN | EPOLLRDNORM; - - return mask; -} - -static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) -{ - const struct cred *creds; - - creds = xa_erase(&ctx->personalities, id); - if (creds) { - put_cred(creds); - return 0; - } - - return -EINVAL; -} - -struct io_tctx_exit { - struct callback_head task_work; - struct completion completion; - struct io_ring_ctx *ctx; -}; - -static __cold void io_tctx_exit_cb(struct callback_head *cb) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_exit *work; - - work = container_of(cb, struct io_tctx_exit, task_work); - /* - * When @in_idle, we're in cancellation and it's racy to remove the - * node. It'll be removed by the end of cancellation, just ignore it. - */ - if (!atomic_read(&tctx->in_idle)) - io_uring_del_tctx_node((unsigned long)work->ctx); - complete(&work->completion); -} - -static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - -static __cold void io_ring_exit_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); - unsigned long timeout = jiffies + HZ * 60 * 5; - unsigned long interval = HZ / 20; - struct io_tctx_exit exit; - struct io_tctx_node *node; - int ret; - - /* - * If we're doing polled IO and end up having requests being - * submitted async (out-of-line), then completions can come in while - * we're waiting for refs to drop. We need to reap these manually, - * as nobody else will be looking for them. 
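io_uring_poll() above is also what makes a ring fd usable with epoll or select: EPOLLOUT when the SQ ring has room, EPOLLIN when completions (or an unflushed overflow) are pending. A hedged sketch of driving a ring from epoll (epfd and ring_fd are assumed to exist already):

	#include <sys/epoll.h>

	static void wait_on_ring(int epfd, int ring_fd)
	{
		struct epoll_event ev = { .events = EPOLLIN, .data.fd = ring_fd };

		epoll_ctl(epfd, EPOLL_CTL_ADD, ring_fd, &ev);
		while (epoll_wait(epfd, &ev, 1, -1) == 1) {
			/* EPOLLIN per the mask above: reap the CQ ring as in
			 * the earlier sketch; an empty reap hints at an
			 * overflow that needs io_uring_enter() to flush. */
		}
	}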
- */ - do { - io_uring_try_cancel_requests(ctx, NULL, true); - if (ctx->sq_data) { - struct io_sq_data *sqd = ctx->sq_data; - struct task_struct *tsk; - - io_sq_thread_park(sqd); - tsk = sqd->thread; - if (tsk && tsk->io_uring && tsk->io_uring->io_wq) - io_wq_cancel_cb(tsk->io_uring->io_wq, - io_cancel_ctx_cb, ctx, true); - io_sq_thread_unpark(sqd); - } - - io_req_caches_free(ctx); - - if (WARN_ON_ONCE(time_after(jiffies, timeout))) { - /* there is little hope left, don't run it too often */ - interval = HZ * 60; - } - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); - - init_completion(&exit.completion); - init_task_work(&exit.task_work, io_tctx_exit_cb); - exit.ctx = ctx; - /* - * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock or - * completion_lock, see io_req_task_submit(). Apart from other work, - * this lock/unlock section also waits for them to finish. - */ - mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->tctx_list)) { - WARN_ON_ONCE(time_after(jiffies, timeout)); - - node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, - ctx_node); - /* don't spin on a single task if cancellation failed */ - list_rotate_left(&ctx->tctx_list); - ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); - if (WARN_ON_ONCE(ret)) - continue; - - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&exit.completion); - mutex_lock(&ctx->uring_lock); - } - mutex_unlock(&ctx->uring_lock); - spin_lock(&ctx->completion_lock); - spin_unlock(&ctx->completion_lock); - - io_ring_ctx_free(ctx); -} - -/* Returns true if we found and killed one or more timeouts */ -static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct io_kiocb *req, *tmp; - int canceled = 0; - - spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); - return canceled != 0; -} - -static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) -{ - unsigned long index; - struct creds *creds; - - mutex_lock(&ctx->uring_lock); - percpu_ref_kill(&ctx->refs); - if (ctx->rings) - __io_cqring_overflow_flush(ctx, true); - xa_for_each(&ctx->personalities, index, creds) - io_unregister_personality(ctx, index); - mutex_unlock(&ctx->uring_lock); - - /* failed during ring init, it couldn't have issued any requests */ - if (ctx->rings) { - io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); - } - - INIT_WORK(&ctx->exit_work, io_ring_exit_work); - /* - * Use system_unbound_wq to avoid spawning tons of event kworkers - * if we're exiting a ton of rings at the same time. It just adds - * noise and overhead, there's no discernible change in runtime - * over using system_wq. 
- */ - queue_work(system_unbound_wq, &ctx->exit_work); -} - -static int io_uring_release(struct inode *inode, struct file *file) -{ - struct io_ring_ctx *ctx = file->private_data; - - file->private_data = NULL; - io_ring_ctx_wait_and_kill(ctx); - return 0; -} - -struct io_task_cancel { - struct task_struct *task; - bool all; -}; - -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_task_cancel *cancel = data; - - return io_match_task_safe(req, cancel->task, cancel->all); -} - -static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) -{ - struct io_defer_entry *de; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { - list_cut_position(&list, &ctx->defer_list, &de->list); - break; - } - } - spin_unlock(&ctx->completion_lock); - if (list_empty(&list)) - return false; - - while (!list_empty(&list)) { - de = list_first_entry(&list, struct io_defer_entry, list); - list_del_init(&de->list); - io_req_complete_failed(de->req, -ECANCELED); - kfree(de); - } - return true; -} - -static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) -{ - struct io_tctx_node *node; - enum io_wq_cancel cret; - bool ret = false; - - mutex_lock(&ctx->uring_lock); - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - /* - * io_wq will stay alive while we hold uring_lock, because it's - * killed after ctx nodes, which requires to take the lock. - */ - if (!tctx || !tctx->io_wq) - continue; - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - mutex_unlock(&ctx->uring_lock); - - return ret; -} - -static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) -{ - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; - - /* failed during ring init, it couldn't have issued any requests */ - if (!ctx->rings) - return; - - while (1) { - enum io_wq_cancel cret; - bool ret = false; - - if (!task) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. 
- */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } - } - - ret |= io_cancel_defer_files(ctx, task, cancel_all); - ret |= io_poll_remove_all(ctx, task, cancel_all); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) - ret |= io_run_task_work(); - if (!ret) - break; - cond_resched(); - } -} - -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - int ret; - - if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; - - tctx = current->io_uring; - if (ctx->iowq_limits_set) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; - - ret = io_wq_max_workers(tctx->io_wq, limits); - if (ret) - return ret; - } - } - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); - } - tctx->last = ctx; - return 0; -} - -/* - * Note that this task has used io_uring. We use it for cancelation purposes. - */ -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (likely(tctx && tctx->last == ctx)) - return 0; - return __io_uring_add_tctx_node(ctx); -} - -/* - * Remove this io_uring_file -> task mapping. - */ -static __cold void io_uring_del_tctx_node(unsigned long index) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - - if (!tctx) - return; - node = xa_erase(&tctx->xa, index); - if (!node) - return; - - WARN_ON_ONCE(current != node->task); - WARN_ON_ONCE(list_empty(&node->ctx_node)); - - mutex_lock(&node->ctx->uring_lock); - list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); - - if (tctx->last == node->ctx) - tctx->last = NULL; - kfree(node); -} - -static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) -{ - struct io_wq *wq = tctx->io_wq; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - io_uring_del_tctx_node(index); - cond_resched(); - } - if (wq) { - /* - * Must be after io_uring_del_tctx_node() (removes nodes under - * uring_lock) to avoid race with io_uring_try_cancel_iowq(). - */ - io_wq_put_and_exit(wq); - tctx->io_wq = NULL; - } -} - -static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) -{ - if (tracked) - return atomic_read(&tctx->inflight_tracked); - return percpu_counter_sum(&tctx->inflight); -} - -/* - * Find any io_uring ctx that this task has registered or done IO on, and cancel - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
- */ -static __cold void io_uring_cancel_generic(bool cancel_all, - struct io_sq_data *sqd) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_ring_ctx *ctx; - s64 inflight; - DEFINE_WAIT(wait); - - WARN_ON_ONCE(sqd && sqd->thread != current); - - if (!current->io_uring) - return; - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); - - atomic_inc(&tctx->in_idle); - do { - io_uring_drop_tctx_refs(current); - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); - if (!inflight) - break; - - if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - /* sqpoll task will cancel all its requests */ - if (node->ctx->sq_data) - continue; - io_uring_try_cancel_requests(node->ctx, current, - cancel_all); - } - } else { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, - cancel_all); - } - - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); - io_run_task_work(); - io_uring_drop_tctx_refs(current); - - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, !cancel_all)) - schedule(); - finish_wait(&tctx->wait, &wait); - } while (1); - - io_uring_clean_tctx(tctx); - if (cancel_all) { - /* - * We shouldn't run task_works after cancel, so just leave - * ->in_idle set for normal exit. - */ - atomic_dec(&tctx->in_idle); - /* for exec all current's requests should be gone, kill tctx */ - __io_uring_free(current); - } -} - -void __io_uring_cancel(bool cancel_all) -{ - io_uring_cancel_generic(cancel_all, NULL); -} - -void io_uring_unreg_ringfd(void) -{ - struct io_uring_task *tctx = current->io_uring; - int i; - - for (i = 0; i < IO_RINGFD_REG_MAX; i++) { - if (tctx->registered_rings[i]) { - fput(tctx->registered_rings[i]); - tctx->registered_rings[i] = NULL; - } - } -} - -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, - int start, int end) -{ - struct file *file; - int offset; - - for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) - continue; - - file = fget(fd); - if (!file) { - return -EBADF; - } else if (file->f_op != &io_uring_fops) { - fput(file); - return -EOPNOTSUPP; - } - tctx->registered_rings[offset] = file; - return offset; - } - - return -EBUSY; -} - -/* - * Register a ring fd to avoid fdget/fdput for each io_uring_enter() - * invocation. User passes in an array of struct io_uring_rsrc_update - * with ->data set to the ring_fd, and ->offset given for the desired - * index. If no index is desired, application may set ->offset == -1U - * and we'll find an available index. Returns number of entries - * successfully processed, or < 0 on error if none were processed. 
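
The registration just described is driven from userspace with io_uring_register(2). A hedged sketch, not from this patch, which assumes a uapi <linux/io_uring.h> new enough to define IORING_REGISTER_RING_FDS:

    #include <linux/io_uring.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Register ring_fd; returns the task-private index on success. */
    static int register_ring_fd(int ring_fd)
    {
            struct io_uring_rsrc_update reg = {
                    .offset = -1U,  /* any free slot */
                    .data   = (unsigned long long)ring_fd,
            };

            if (syscall(__NR_io_uring_register, ring_fd,
                        IORING_REGISTER_RING_FDS, &reg, 1) != 1)
                    return -1;
            return reg.offset;      /* kernel wrote the chosen index back */
    }

    /* Later, pass the index instead of the fd and set the flag. */
    static int enter_registered(int index, unsigned int to_submit)
    {
            return syscall(__NR_io_uring_enter, index, to_submit, 0,
                           IORING_ENTER_REGISTERED_RING, NULL, 0);
    }

This pairs with the fd-or-index dispatch in io_uring_enter() below.
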
- */ -static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_rsrc_update reg; - struct io_uring_task *tctx; - int ret, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - - mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); - mutex_lock(&ctx->uring_lock); - if (ret) - return ret; - - tctx = current->io_uring; - for (i = 0; i < nr_args; i++) { - int start, end; - - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - - if (reg.resv) { - ret = -EINVAL; - break; - } - - if (reg.offset == -1U) { - start = 0; - end = IO_RINGFD_REG_MAX; - } else { - if (reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - start = reg.offset; - end = start + 1; - } - - ret = io_ring_add_registered_fd(tctx, reg.data, start, end); - if (ret < 0) - break; - - reg.offset = ret; - if (copy_to_user(&arg[i], ®, sizeof(reg))) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - ret = -EFAULT; - break; - } - } - - return i ? i : ret; -} - -static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_task *tctx = current->io_uring; - struct io_uring_rsrc_update reg; - int ret = 0, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - if (!tctx) - return 0; - - for (i = 0; i < nr_args; i++) { - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - - reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[reg.offset]) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - } - } - - return i ? i : ret; -} - -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - - switch (offset) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - ptr = ctx->rings; - break; - case IORING_OFF_SQES: - ptr = ctx->sq_sqes; - break; - default: - return ERR_PTR(-EINVAL); - } - - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; -} - -#ifdef CONFIG_MMU - -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - size_t sz = vma->vm_end - vma->vm_start; - unsigned long pfn; - void *ptr; - - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 
0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - -static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) -{ - DEFINE_WAIT(wait); - - do { - if (!io_sqring_full(ctx)) - break; - prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); - - if (!io_sqring_full(ctx)) - break; - schedule(); - } while (!signal_pending(current)); - - finish_wait(&ctx->sqo_sq_wait, &wait); - return 0; -} - -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) -{ - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; - - if (argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - } - return 0; -} - -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) -{ - struct io_uring_getevents_arg arg; - - /* - * If EXT_ARG isn't set, then we have no timespec and the argp pointer - * is just a pointer to the sigset_t. - */ - if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; - return 0; - } - - /* - * EXT_ARG is set - ensure we agree on the size of it and copy in our - * timespec and sigset_t pointers if good. - */ - if (*argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); - return 0; -} - -SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const void __user *, argp, - size_t, argsz) -{ - struct io_ring_ctx *ctx; - struct fd f; - long ret; - - io_run_task_work(); - - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) - return -EINVAL; - - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - if (flags & IORING_ENTER_REGISTERED_RING) { - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || fd >= IO_RINGFD_REG_MAX) - return -EINVAL; - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - f.file = tctx->registered_rings[fd]; - f.flags = 0; - } else { - f = fdget(fd); - } - - if (unlikely(!f.file)) - return -EBADF; - - ret = -EOPNOTSUPP; - if (unlikely(f.file->f_op != &io_uring_fops)) - goto out_fput; - - ret = -ENXIO; - ctx = f.file->private_data; - if (unlikely(!percpu_ref_tryget(&ctx->refs))) - goto out_fput; - - ret = -EBADFD; - if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) - goto out; - - /* - * For SQ polling, the thread will do all submissions and completions. - * Just return the requested submit count, and wake the thread if - * we were asked to. 
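
io_get_ext_arg() above defines the userspace contract for IORING_ENTER_EXT_ARG: argp stops being a bare sigset_t pointer and instead carries a struct bundling the signal mask and a timeout. A sketch of the matching call, for illustration only (struct io_uring_getevents_arg comes from the uapi header):

    #include <linux/io_uring.h>
    #include <linux/time_types.h>
    #include <signal.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Wait for one CQE, but give up after 10ms. */
    static int wait_one_cqe_timeout(int ring_fd)
    {
            struct __kernel_timespec ts = { .tv_nsec = 10 * 1000 * 1000 };
            struct io_uring_getevents_arg arg = {
                    .sigmask        = 0,            /* keep current mask */
                    .sigmask_sz     = _NSIG / 8,
                    .ts             = (uint64_t)(uintptr_t)&ts,
            };

            return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
                           IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
                           &arg, sizeof(arg));
    }

Note that arg.pad must stay zero and argsz must be exactly sizeof(arg), as enforced above.
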
- */ - ret = 0; - if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - - if (unlikely(ctx->sq_data->thread == NULL)) { - ret = -EOWNERDEAD; - goto out; - } - if (flags & IORING_ENTER_SQ_WAKEUP) - wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } - ret = to_submit; - } else if (to_submit) { - ret = io_uring_add_tctx_node(ctx); - if (unlikely(ret)) - goto out; - - mutex_lock(&ctx->uring_lock); - ret = io_submit_sqes(ctx, to_submit); - if (ret != to_submit) { - mutex_unlock(&ctx->uring_lock); - goto out; - } - if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) - goto iopoll_locked; - mutex_unlock(&ctx->uring_lock); - } - if (flags & IORING_ENTER_GETEVENTS) { - int ret2; - if (ctx->syscall_iopoll) { - /* - * We disallow the app entering submit/complete with - * polling, but we still need to lock the ring to - * prevent racing with polled issue that got punted to - * a workqueue. - */ - mutex_lock(&ctx->uring_lock); -iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); - ret2 = io_iopoll_check(ctx, min_complete); - } - mutex_unlock(&ctx->uring_lock); - } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); - } - } - - if (!ret) { - ret = ret2; - - /* - * EBADR indicates that one or more CQE were dropped. - * Once the user has been informed we can clear the bit - * as they are obviously ok with those drops. - */ - if (unlikely(ret2 == -EBADR)) - clear_bit(IO_CHECK_CQ_DROPPED_BIT, - &ctx->check_cq); - } - } - -out: - percpu_ref_put(&ctx->refs); -out_fput: - fdput(f); - return ret; -} - -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - unsigned __capi; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - CAP_FOR_EACH_U32(__capi) - seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); - seq_putc(m, '\n'); - return 0; -} - -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) -{ - struct io_sq_data *sq = NULL; - struct io_overflow_cqe *ocqe; - struct io_rings *r = ctx->rings; - unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; - unsigned int sq_head = READ_ONCE(r->sq.head); - unsigned int sq_tail = READ_ONCE(r->sq.tail); - unsigned int cq_head = READ_ONCE(r->cq.head); - unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; - unsigned int sq_entries, cq_entries; - bool has_lock; - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - unsigned int i; - - if (is_cqe32) - cq_shift = 1; - - /* - * we may get imprecise sqe and cqe info if uring is actively running - * since we get cached_sq_head and cached_cq_tail without uring_lock - * and sq_tail and cq_head are changed by userspace. But it's ok since - * we usually use these info when it is stuck. - */ - seq_printf(m, "SqMask:\t0x%x\n", sq_mask); - seq_printf(m, "SqHead:\t%u\n", sq_head); - seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); - seq_printf(m, "CqMask:\t0x%x\n", cq_mask); - seq_printf(m, "CqHead:\t%u\n", cq_head); - seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); - seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); - sq_entries = min(sq_tail - sq_head, ctx->sq_entries); - for (i = 0; i < sq_entries; i++) { - unsigned int entry = i + sq_head; - unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); - struct io_uring_sqe *sqe; - - if (sq_idx > sq_mask) - continue; - sqe = &ctx->sq_sqes[sq_idx]; - seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", - sq_idx, sqe->opcode, sqe->fd, sqe->flags, - sqe->user_data); - } - seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); - for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - - if (!is_cqe32) { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags); - } else { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " - "extra1:%llu, extra2:%llu\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); - } - } - - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; - } - - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(ctx, i); - - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: <none>\n", i); - } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); - } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - - seq_puts(m, "PollList:\n"); - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list = &ctx->cancel_hash[i]; - struct io_kiocb *req; - - hlist_for_each_entry(req, list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - } - - seq_puts(m, "CqOverflowList:\n"); - list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { - struct io_uring_cqe *cqe = &ocqe->cqe; - - seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", - cqe->user_data, cqe->res, cqe->flags); - - } - - spin_unlock(&ctx->completion_lock); -} - -static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} -#endif - -static const struct file_operations io_uring_fops = { - .release = io_uring_release, - .mmap = io_uring_mmap, -#ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, - .mmap_capabilities = io_uring_nommu_mmap_capabilities, -#endif - .poll = io_uring_poll, -#ifdef CONFIG_PROC_FS - .show_fdinfo = io_uring_show_fdinfo, -#endif -}; - -static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - struct io_rings *rings; - size_t size, sq_array_offset; - - /* make sure these are sane, as we already accounted them */ - ctx->sq_entries = p->sq_entries; - ctx->cq_entries = p->cq_entries; - - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; - - rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; - - ctx->rings = rings; - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); - rings->sq_ring_mask = p->sq_entries - 1; - rings->cq_ring_mask = p->cq_entries - 1; - rings->sq_ring_entries = p->sq_entries; - rings->cq_ring_entries = p->cq_entries; - - if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -EOVERFLOW; - } - - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -ENOMEM; - } - - return 0; -} - -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) -{ - int ret, fd; - - fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); - if (fd < 0) - return fd; - - ret = io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } - fd_install(fd, file); - return fd; -} - -/* 
- * Allocate an anonymous fd, this is what constitutes the application - * visible backing of an io_uring instance. The application mmaps this - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, - * we have to tie this fd to a socket for file garbage collection purposes. - */ -static struct file *io_uring_get_file(struct io_ring_ctx *ctx) -{ - struct file *file; -#if defined(CONFIG_UNIX) - int ret; - - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, - &ctx->ring_sock); - if (ret) - return ERR_PTR(ret); -#endif - - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, - O_RDWR | O_CLOEXEC, NULL); -#if defined(CONFIG_UNIX) - if (IS_ERR(file)) { - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; - } else { - ctx->ring_sock->file = file; - } -#endif - return file; -} - -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) -{ - struct io_ring_ctx *ctx; - struct file *file; - int ret; - - if (!entries) - return -EINVAL; - if (entries > IORING_MAX_ENTRIES) { - if (!(p->flags & IORING_SETUP_CLAMP)) - return -EINVAL; - entries = IORING_MAX_ENTRIES; - } - - /* - * Use twice as many entries for the CQ ring. It's possible for the - * application to drive a higher depth than the size of the SQ ring, - * since the sqes are only used at submission time. This allows for - * some flexibility in overcommitting a bit. If the application has - * set IORING_SETUP_CQSIZE, it will have passed in the desired number - * of CQ ring entries manually. - */ - p->sq_entries = roundup_pow_of_two(entries); - if (p->flags & IORING_SETUP_CQSIZE) { - /* - * If IORING_SETUP_CQSIZE is set, we do the same roundup - * to a power-of-two, if it isn't already. We do NOT impose - * any cq vs sq ring sizing. - */ - if (!p->cq_entries) - return -EINVAL; - if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { - if (!(p->flags & IORING_SETUP_CLAMP)) - return -EINVAL; - p->cq_entries = IORING_MAX_CQ_ENTRIES; - } - p->cq_entries = roundup_pow_of_two(p->cq_entries); - if (p->cq_entries < p->sq_entries) - return -EINVAL; - } else { - p->cq_entries = 2 * p->sq_entries; - } - - ctx = io_ring_ctx_alloc(p); - if (!ctx) - return -ENOMEM; - - /* - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user - * space applications don't need to do io completion events - * polling again, they can rely on io_sq_thread to do polling - * work, which can reduce cpu usage and uring_lock contention. - */ - if (ctx->flags & IORING_SETUP_IOPOLL && - !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; - - ctx->compat = in_compat_syscall(); - if (!capable(CAP_IPC_LOCK)) - ctx->user = get_uid(current_user()); - - /* - * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if - * COOP_TASKRUN is set, then IPIs are never needed by the app. - */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG)) - goto err; - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - goto err; - ctx->notify_method = TWA_SIGNAL; - } - - /* - * This is just grabbed for accounting purposes. 
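
The ring-sizing rules above deserve a concrete illustration: entries are rounded up to powers of two, the CQ ring defaults to twice the SQ ring, and IORING_SETUP_CQSIZE plus IORING_SETUP_CLAMP lets an application ask for a deeper CQ ring without failing on out-of-range values. A hedged userspace sketch:

    #include <linux/io_uring.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int setup_deep_cq(unsigned int sq_entries)
    {
            struct io_uring_params p;

            memset(&p, 0, sizeof(p));
            p.flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
            p.cq_entries = 8 * sq_entries;  /* rounded up; clamped if huge */

            /* On success p.sq_entries/p.cq_entries hold the real sizes. */
            return syscall(__NR_io_uring_setup, sq_entries, &p);
    }
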
When a process exits, - * the mm is exited and dropped before the files, hence we need to hang - * on to this mm purely for the purposes of being able to unaccount - * memory (locked/pinned vm). It's not used for anything else. - */ - mmgrab(current->mm); - ctx->mm_account = current->mm; - - ret = io_allocate_scq_urings(ctx, p); - if (ret) - goto err; - - ret = io_sq_offload_create(ctx, p); - if (ret) - goto err; - /* always set a rsrc node */ - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - io_rsrc_node_switch(ctx, NULL); - - memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - - memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE; - - if (copy_to_user(params, p, sizeof(*p))) { - ret = -EFAULT; - goto err; - } - - file = io_uring_get_file(ctx); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto err; - } - - /* - * Install ring fd as the very last thing, so we don't risk someone - * having closed it before we finish setup - */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } - - trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - return ret; -err: - io_ring_ctx_wait_and_kill(ctx); - return ret; -} - -/* - * Sets up an aio uring context, and returns the fd. Applications asks for a - * ring size, we return the actual sq/cq ring sizes (among other things) in the - * params structure passed in. 
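
Those sq_off/cq_off tables are what make the rings reachable from userspace: each field is an offset into one of the fixed mmap regions. A minimal sketch of mapping them after setup, assuming the single-mmap layout that IORING_FEAT_SINGLE_MMAP (advertised above) guarantees:

    #include <linux/io_uring.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int map_rings(unsigned int entries)
    {
            struct io_uring_params p;
            size_t sq_sz, cq_sz, sz;
            void *rings, *sqes;
            int fd;

            memset(&p, 0, sizeof(p));
            fd = syscall(__NR_io_uring_setup, entries, &p);
            if (fd < 0)
                    return -1;

            /* SQ and CQ share one mapping; take the larger extent. */
            sq_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned int);
            cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
            sz = sq_sz > cq_sz ? sq_sz : cq_sz;
            rings = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);

            /* The SQE array is its own mapping at IORING_OFF_SQES. */
            sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                        fd, IORING_OFF_SQES);

            return (rings == MAP_FAILED || sqes == MAP_FAILED) ? -1 : fd;
    }

In practice applications use liburing, which performs exactly this dance in io_uring_queue_init().
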
- */ -static long io_uring_setup(u32 entries, struct io_uring_params __user *params) -{ - struct io_uring_params p; - int i; - - if (copy_from_user(&p, params, sizeof(p))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { - if (p.resv[i]) - return -EINVAL; - } - - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) - return -EINVAL; - - return io_uring_create(entries, &p, params); -} - -SYSCALL_DEFINE2(io_uring_setup, u32, entries, - struct io_uring_params __user *, params) -{ - return io_uring_setup(entries, params); -} - -static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_probe *p; - size_t size; - int i, ret; - - size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; - ret = -EINVAL; - if (memchr_inv(p, 0, size)) - goto out; - - p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; - - for (i = 0; i < nr_args; i++) { - p->ops[i].op = i; - if (!io_op_defs[i].not_supported) - p->ops[i].flags = IO_URING_OP_SUPPORTED; - } - p->ops_len = i; - - ret = 0; - if (copy_to_user(arg, p, size)) - ret = -EFAULT; -out: - kfree(p); - return ret; -} - -static int io_register_personality(struct io_ring_ctx *ctx) -{ - const struct cred *creds; - u32 id; - int ret; - - creds = get_current_cred(); - - ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, - XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); - if (ret < 0) { - put_cred(creds); - return ret; - } - return id; -} - -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) -{ - struct io_uring_restriction *res; - size_t size; - int i, ret; - - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) - return -EINVAL; - - size = array_size(nr_args, sizeof(*res)); - if (size == SIZE_MAX) - return -EOVERFLOW; - - res = memdup_user(arg, size); - if (IS_ERR(res)) - return PTR_ERR(res); - - ret = 0; - - for (i = 0; i < nr_args; i++) { - switch (res[i].opcode) { - case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); - break; - case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); - break; - case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; - break; - case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; - break; - default: - ret = -EINVAL; - goto out; - } - } - -out: - /* Reset all restrictions if an error happened */ - if (ret != 0) - memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - - kfree(res); - return ret; -} - -static int 
io_register_enable_rings(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - if (ctx->restrictions.registered) - ctx->restricted = 1; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - return 0; -} - -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - __u32 tmp; - int err; - - if (check_add_overflow(up->offset, nr_args, &tmp)) - return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; - - switch (type) { - case IORING_RSRC_FILE: - return __io_sqe_files_update(ctx, up, nr_args); - case IORING_RSRC_BUFFER: - return __io_sqe_buffers_update(ctx, up, nr_args); - } - return -EINVAL; -} - -static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update2 up; - - if (!nr_args) - return -EINVAL; - memset(&up, 0, sizeof(up)); - if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) - return -EFAULT; - if (up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); -} - -static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size, unsigned type) -{ - struct io_uring_rsrc_update2 up; - - if (size != sizeof(up)) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (!up.nr || up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, type, &up, up.nr); -} - -static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size, unsigned int type) -{ - struct io_uring_rsrc_register rr; - - /* keep it extendible */ - if (size != sizeof(rr)) - return -EINVAL; - - memset(&rr, 0, sizeof(rr)); - if (copy_from_user(&rr, arg, size)) - return -EFAULT; - if (!rr.nr || rr.resv2) - return -EINVAL; - if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) - return -EINVAL; - - switch (type) { - case IORING_RSRC_FILE: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - case IORING_RSRC_BUFFER: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - } - return -EINVAL; -} - -static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, - void __user *arg, unsigned len) -{ - struct io_uring_task *tctx = current->io_uring; - cpumask_var_t new_mask; - int ret; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_clear(new_mask); - if (len > cpumask_size()) - len = cpumask_size(); - - if (in_compat_syscall()) { - ret = compat_get_bitmap(cpumask_bits(new_mask), - (const compat_ulong_t __user *)arg, - len * 8 /* CHAR_BIT */); - } else { - ret = copy_from_user(new_mask, arg, len); - } - - if (ret) { - free_cpumask_var(new_mask); - return -EFAULT; - } - - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); - free_cpumask_var(new_mask); - return ret; -} - -static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - return io_wq_cpu_affinity(tctx->io_wq, NULL); -} - -static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user 
*arg) - __must_hold(&ctx->uring_lock) -{ - struct io_tctx_node *node; - struct io_uring_task *tctx = NULL; - struct io_sq_data *sqd = NULL; - __u32 new_count[2]; - int i, ret; - - if (copy_from_user(new_count, arg, sizeof(new_count))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i] > INT_MAX) - return -EINVAL; - - if (ctx->flags & IORING_SETUP_SQPOLL) { - sqd = ctx->sq_data; - if (sqd) { - /* - * Observe the correct sqd->lock -> ctx->uring_lock - * ordering. Fine to drop uring_lock here, we hold - * a ref to the ctx. - */ - refcount_inc(&sqd->refs); - mutex_unlock(&ctx->uring_lock); - mutex_lock(&sqd->lock); - mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; - } - } else { - tctx = current->io_uring; - } - - BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); - - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i]) - ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; - - if (tctx && tctx->io_wq) { - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; - } else { - memset(new_count, 0, sizeof(new_count)); - } - - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - - if (copy_to_user(arg, new_count, sizeof(new_count))) - return -EFAULT; - - /* that's it for SQPOLL, only the SQPOLL task creates requests */ - if (sqd) - return 0; - - /* now propagate the restriction to all registered users */ - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - if (WARN_ON_ONCE(!tctx->io_wq)) - continue; - - for (i = 0; i < ARRAY_SIZE(new_count); i++) - new_count[i] = ctx->iowq_limits[i]; - /* ignore errors, it always returns zero anyway */ - (void)io_wq_max_workers(tctx->io_wq, new_count); - } - return 0; -err: - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - return ret; -} - -static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - struct page **pages; - int nr_pages; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) - return -EINVAL; - - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - - bl = io_buffer_get_list(ctx, reg.bgid); - if (bl) { - /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) - return -EEXIST; - } else { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; - } - - pages = io_pin_pages(reg.ring_addr, - struct_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(bl); - return PTR_ERR(pages); - } - - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; -} - -static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - - bl = io_buffer_get_list(ctx, reg.bgid); - if (!bl) - return 
-ENOENT; - if (!bl->buf_nr_pages) - return -EINVAL; - - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); - } - return 0; -} - -static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, - void __user *arg, unsigned nr_args) - __releases(ctx->uring_lock) - __acquires(ctx->uring_lock) -{ - int ret; - - /* - * We're inside the ring mutex, if the ref is already dying, then - * someone else killed the ctx or is already going through - * io_uring_register(). - */ - if (percpu_ref_is_dying(&ctx->refs)) - return -ENXIO; - - if (ctx->restricted) { - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; - opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); - if (!test_bit(opcode, ctx->restrictions.register_op)) - return -EACCES; - } - - switch (opcode) { - case IORING_REGISTER_BUFFERS: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_BUFFERS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_buffers_unregister(ctx); - break; - case IORING_REGISTER_FILES: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_files_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_FILES: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_files_unregister(ctx); - break; - case IORING_REGISTER_FILES_UPDATE: - ret = io_register_files_update(ctx, arg, nr_args); - break; - case IORING_REGISTER_EVENTFD: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 0); - break; - case IORING_REGISTER_EVENTFD_ASYNC: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 1); - break; - case IORING_UNREGISTER_EVENTFD: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_eventfd_unregister(ctx); - break; - case IORING_REGISTER_PROBE: - ret = -EINVAL; - if (!arg || nr_args > 256) - break; - ret = io_probe(ctx, arg, nr_args); - break; - case IORING_REGISTER_PERSONALITY: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_personality(ctx); - break; - case IORING_UNREGISTER_PERSONALITY: - ret = -EINVAL; - if (arg) - break; - ret = io_unregister_personality(ctx, nr_args); - break; - case IORING_REGISTER_ENABLE_RINGS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_enable_rings(ctx); - break; - case IORING_REGISTER_RESTRICTIONS: - ret = io_register_restrictions(ctx, arg, nr_args); - break; - case IORING_REGISTER_FILES2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); - break; - case IORING_REGISTER_FILES_UPDATE2: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_FILE); - break; - case IORING_REGISTER_BUFFERS2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_BUFFERS_UPDATE: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_IOWQ_AFF: - ret = -EINVAL; - if (!arg || !nr_args) - break; - ret = io_register_iowq_aff(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_IOWQ_AFF: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_unregister_iowq_aff(ctx); - break; - case IORING_REGISTER_IOWQ_MAX_WORKERS: - ret = -EINVAL; - if (!arg || nr_args != 2) - break; - ret = io_register_iowq_max_workers(ctx, arg); - break; - case IORING_REGISTER_RING_FDS: - ret = io_ringfd_register(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_RING_FDS: - ret = io_ringfd_unregister(ctx, arg, nr_args); - break; - case 
IORING_REGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_register_pbuf_ring(ctx, arg); - break; - case IORING_UNREGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_unregister_pbuf_ring(ctx, arg); - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, - void __user *, arg, unsigned int, nr_args) -{ - struct io_ring_ctx *ctx; - long ret = -EBADF; - struct fd f; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) - goto out_fput; - - ctx = f.file->private_data; - - io_run_task_work(); - - mutex_lock(&ctx->uring_lock); - ret = __io_uring_register(ctx, opcode, arg, nr_args); - mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: - fdput(f); - return ret; -} - -static int __init io_uring_init(void) -{ -#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ - BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ - BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ -} while (0) - -#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ - __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) - BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); - BUILD_BUG_SQE_ELEM(0, __u8, opcode); - BUILD_BUG_SQE_ELEM(1, __u8, flags); - BUILD_BUG_SQE_ELEM(2, __u16, ioprio); - BUILD_BUG_SQE_ELEM(4, __s32, fd); - BUILD_BUG_SQE_ELEM(8, __u64, off); - BUILD_BUG_SQE_ELEM(8, __u64, addr2); - BUILD_BUG_SQE_ELEM(16, __u64, addr); - BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); - BUILD_BUG_SQE_ELEM(24, __u32, len); - BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); - BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); - BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); - BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); - BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); - BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); - BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); - BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); - BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); - BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); - BUILD_BUG_SQE_ELEM(28, __u32, open_flags); - BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); - BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); - BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); - BUILD_BUG_SQE_ELEM(32, __u64, user_data); - BUILD_BUG_SQE_ELEM(40, __u16, buf_index); - BUILD_BUG_SQE_ELEM(40, __u16, buf_group); - BUILD_BUG_SQE_ELEM(42, __u16, personality); - BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); - BUILD_BUG_SQE_ELEM(44, __u32, file_index); - BUILD_BUG_SQE_ELEM(48, __u64, addr3); - - BUILD_BUG_ON(sizeof(struct io_uring_files_update) != - sizeof(struct io_uring_rsrc_update)); - BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > - sizeof(struct io_uring_rsrc_update2)); - - /* ->buf_index is u16 */ - BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); - BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); - BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != - offsetof(struct io_uring_buf_ring, tail)); - - /* should fit into one byte */ - BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); - BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); - BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); - 
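
The BUILD_BUG_SQE_ELEM checks above freeze the uAPI layout at compile time. The same technique is available to ordinary C11 programs via static_assert; a sketch with a made-up wire structure (nothing here is from the patch):

    #include <assert.h>     /* static_assert */
    #include <stddef.h>     /* offsetof */
    #include <stdint.h>

    struct wire_hdr {               /* hypothetical on-wire header */
            uint8_t  opcode;        /* must stay at offset 0 */
            uint8_t  flags;
            uint16_t ioprio;
            int32_t  fd;
            uint64_t user_data;     /* must stay at offset 8 */
    };

    /* Break the build, not the protocol, if the layout drifts. */
    static_assert(sizeof(struct wire_hdr) == 16, "wire_hdr resized");
    static_assert(offsetof(struct wire_hdr, opcode) == 0, "opcode moved");
    static_assert(offsetof(struct wire_hdr, fd) == 4, "fd moved");
    static_assert(offsetof(struct wire_hdr, user_data) == 8, "user_data moved");
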
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); - - BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - - BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); - - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); - return 0; -}; -__initcall(io_uring_init); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d2a9f699e17e..2b82c7f1de88 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio) static struct bio_set iomap_ioend_bioset; static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio) +iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) { struct iomap_page *iop = to_iomap_page(folio); unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + gfp_t gfp; if (iop || nr_blocks <= 1) return iop; + if (flags & IOMAP_NOWAIT) + gfp = GFP_NOWAIT; + else + gfp = GFP_NOFS | __GFP_NOFAIL; + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - GFP_NOFS | __GFP_NOFAIL); - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - folio_attach_private(folio, iop); + gfp); + if (iop) { + spin_lock_init(&iop->uptodate_lock); + if (folio_test_uptodate(folio)) + bitmap_fill(iop->uptodate, nr_blocks); + folio_attach_private(folio, iop); + } return iop; } @@ -154,9 +162,6 @@ static void iomap_iop_set_range_uptodate(struct folio *folio, static void iomap_set_range_uptodate(struct folio *folio, struct iomap_page *iop, size_t off, size_t len) { - if (folio_test_error(folio)) - return; - if (iop) iomap_iop_set_range_uptodate(folio, iop, off, len); else @@ -226,7 +231,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); else iop = to_iomap_page(folio); @@ -264,7 +269,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; @@ -492,31 +497,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); -#ifdef CONFIG_MIGRATION -int -iomap_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - struct folio *folio = page_folio(page); - struct folio *newfolio = page_folio(newpage); - int ret; - - ret = folio_migrate_mapping(mapping, newfolio, folio, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; - - if (folio_test_private(folio)) - folio_attach_private(newfolio, folio_detach_private(folio)); - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(newfolio, folio); - else - folio_migrate_flags(newfolio, folio); - return MIGRATEPAGE_SUCCESS; -} -EXPORT_SYMBOL_GPL(iomap_migrate_page); -#endif /* CONFIG_MIGRATION */ - static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -547,10 +527,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop = 
iomap_page_create(iter->inode, folio); + struct iomap_page *iop; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); + unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; @@ -558,6 +539,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; folio_clear_error(folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + return -EAGAIN; + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -574,7 +559,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return -EIO; folio_zero_segments(folio, poff, from, to, poff + plen); } else { - int status = iomap_read_folio_sync(block_start, folio, + int status; + + if (iter->flags & IOMAP_NOWAIT) + return -EAGAIN; + + status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; @@ -603,6 +593,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -622,7 +615,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); if (!folio) { - status = -ENOMEM; + status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } if (pos + len > folio_pos(folio) + folio_size(folio)) @@ -740,6 +733,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) loff_t pos = iter->pos; ssize_t written = 0; long status = 0; + struct address_space *mapping = iter->inode->i_mapping; + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { struct folio *folio; @@ -752,6 +747,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: + status = balance_dirty_pages_ratelimited_flags(mapping, + bdp_flags); + if (unlikely(status)) + break; + if (bytes > length) bytes = length; @@ -760,6 +760,10 @@ again: * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. + * + * For async buffered writes the assumption is that the user + * page has already been faulted in. This can be optimized by + * faulting the user page. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; @@ -771,7 +775,7 @@ again: break; page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(iter->inode->i_mapping)) + if (mapping_writably_mapped(mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); @@ -796,10 +800,12 @@ again: pos += status; written += status; length -= status; - - balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); + if (status == -EAGAIN) { + iov_iter_revert(i, written); + return -EAGAIN; + } return written ? 
written : status; } @@ -815,6 +821,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, }; int ret; + if (iocb->ki_flags & IOCB_NOWAIT) + iter.flags |= IOMAP_NOWAIT; + while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); if (iter.pos == iocb->ki_pos) @@ -917,10 +926,10 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) pos += bytes; length -= bytes; written += bytes; - if (did_zero) - *did_zero = true; } while (length > 0); + if (did_zero) + *did_zero = true; return written; } @@ -1329,7 +1338,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio); + struct iomap_page *iop = iomap_page_create(inode, folio, 0); struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); @@ -1478,10 +1487,10 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) pgoff_t end_index = isize >> PAGE_SHIFT; /* - * Skip the page if it's fully outside i_size, e.g. due to a - * truncate operation that's in progress. We must redirty the - * page so that reclaim stops reclaiming it. Otherwise - * iomap_release_folio() is called on it and gets confused. + * Skip the page if it's fully outside i_size, e.g. + * due to a truncate operation that's in progress. We've + * cleaned this page and truncate will finish things off for + * us. * * Note that the end_index is unsigned long. If the given * offset is greater than 16TB on a 32-bit system then if we @@ -1496,7 +1505,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) */ if (folio->index > end_index || (folio->index == end_index && poff == 0)) - goto redirty; + goto unlock; /* * The page straddles i_size. It must be zeroed out on each @@ -1514,6 +1523,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) redirty: folio_redirty_for_writepage(wbc, folio); +unlock: folio_unlock(folio); return 0; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 370c3241618a..c75d33d5c3ce 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -52,7 +52,7 @@ struct iomap_dio { }; static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, - struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf) + struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf) { if (dio->dops && dio->dops->bio_set) return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, @@ -212,10 +212,10 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * mapping, and whether or not we want FUA. Note that we can end up * clearing the WRITE_FUA flag in the dio request. 
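
The IOMAP_NOWAIT plumbing in the hunks above is what lets a nonblocking buffered write bail out with -EAGAIN instead of sleeping on page allocation or a read-modify-write. From userspace the same path is reachable with pwritev2() and RWF_NOWAIT; a hedged sketch (support varies by filesystem and kernel version):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <sys/uio.h>

    /* Try a no-wait write first; fall back to a blocking one. */
    static ssize_t write_nowait_or_block(int fd, void *buf, size_t len,
                                         off_t off)
    {
            struct iovec iov = { .iov_base = buf, .iov_len = len };
            ssize_t ret;

            ret = pwritev2(fd, &iov, 1, off, RWF_NOWAIT);
            if (ret >= 0 || (errno != EAGAIN && errno != EOPNOTSUPP))
                    return ret;

            /* Would have blocked, or isn't supported here: do it the
             * ordinary way. */
            return pwritev2(fd, &iov, 1, off, 0);
    }
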
*/ -static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, +static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) { - unsigned int opflags = REQ_SYNC | REQ_IDLE; + blk_opf_t opflags = REQ_SYNC | REQ_IDLE; if (!(dio->flags & IOMAP_DIO_WRITE)) { WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); @@ -242,10 +242,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, struct inode *inode = iter->inode; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; - unsigned int align = iov_iter_alignment(dio->submit.iter); loff_t length = iomap_length(iter); loff_t pos = iter->pos; - unsigned int bio_opf; + blk_opf_t bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; @@ -253,7 +252,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; - if ((pos | length | align) & ((1 << blkbits) - 1)) + if ((pos | length) & ((1 << blkbits) - 1) || + !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; if (iomap->type == IOMAP_UNWRITTEN) { @@ -548,17 +548,18 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) { dio->flags |= IOMAP_DIO_NEED_SYNC; - /* - * For datasync only writes, we optimistically try using FUA for - * this IO. Any non-FUA write that occurs will clear this flag, - * hence we know before completion whether a cache flush is - * necessary. - */ - if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) - dio->flags |= IOMAP_DIO_WRITE_FUA; + /* + * For datasync only writes, we optimistically try + * using FUA for this IO. Any non-FUA write that + * occurs will clear this flag, hence we know before + * completion whether a cache flush is necessary. 
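
That optimisation has a simple userspace trigger: open with O_DIRECT plus O_DSYNC (but not O_SYNC), and on devices that honour FUA the data-integrity write completes without a separate cache flush. A sketch, assuming a 4KiB logical block size:

    #define _GNU_SOURCE             /* O_DIRECT */
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    static int write_block_dsync(const char *path, off_t off)
    {
            void *buf;
            int fd, ok;

            fd = open(path, O_WRONLY | O_DIRECT | O_DSYNC);
            if (fd < 0)
                    return -1;

            /* O_DIRECT requires buffer/offset/length alignment; query
             * the device's logical block size in real code. */
            if (posix_memalign(&buf, 4096, 4096)) {
                    close(fd);
                    return -1;
            }
            memset(buf, 0, 4096);

            ok = pwrite(fd, buf, 4096, off) == 4096;
            free(buf);
            close(fd);
            return ok ? 0 : -1;
    }
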
+ */ + if (!(iocb->ki_flags & IOCB_SYNC)) + dio->flags |= IOMAP_DIO_WRITE_FUA; + } } if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 95a19f25d61c..b466172eec25 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -82,7 +82,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, return 0; } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs); + ll_rw_block(REQ_OP_READ, haveblocks, bhs); curbh = 0; curpage = 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eb315e81f1a6..890b5543a1c5 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,10 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); + ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | + REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); *cbh = bh; return ret; @@ -763,7 +763,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); } cond_resched(); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0cbeeaec2d1..2a1b9da7c3e3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1465,7 +1465,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, if (!journal) return NULL; - bdevname(journal->j_dev, journal->j_devname); + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); @@ -1507,7 +1508,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return NULL; journal->j_inode = inode; - bdevname(journal->j_dev, journal->j_devname); + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); jbd2_stats_proc_init(journal); @@ -1602,7 +1604,7 @@ static int journal_reset(journal_t *journal) * This function expects that the caller will have locked the journal * buffer head, and will return with it unlocked */ -static int jbd2_write_superblock(journal_t *journal, int write_flags) +static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; @@ -1636,7 +1638,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE, write_flags, bh); + ret = submit_bh(REQ_OP_WRITE | write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); @@ -1659,13 +1661,14 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) * @journal: The journal to update. * @tail_tid: TID of the new transaction at the tail of the log * @tail_block: The first block of the transaction at the tail of the log - * @write_op: With which operation should we write the journal sb + * @write_flags: Flags for the journal sb write operation * * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. 
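
The jbd2 conversions above follow the tree-wide move from a separate (op, flags) argument pair to a single typed word, e.g. submit_bh(REQ_OP_WRITE | REQ_SYNC, bh), with the operation in the low bits and request flags above them. A toy model of that encoding (hypothetical constants; the real type is blk_opf_t with REQ_OP_BITS == 8):

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t req_opf_t;     /* stand-in for blk_opf_t */

    #define OP_BITS         8
    #define OP_MASK         ((1u << OP_BITS) - 1)

    enum { OP_READ, OP_WRITE, OP_FLUSH };

    #define F_SYNC          (1u << OP_BITS)         /* flags sit above the op */
    #define F_FUA           (1u << (OP_BITS + 1))

    static inline unsigned int req_op(req_opf_t opf)
    {
            return opf & OP_MASK;
    }

    static inline bool req_is_sync(req_opf_t opf)
    {
            return opf & F_SYNC;
    }

    /* A submit path then takes one argument: OP_WRITE | F_SYNC | F_FUA. */

Making the word a distinct type lets tooling (sparse and __bitwise in the kernel) catch callers that pass a bare op where flags were meant, which is the motivation for these conversions.
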
*/ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, - unsigned long tail_block, int write_op) + unsigned long tail_block, + blk_opf_t write_flags) { journal_superblock_t *sb = journal->j_superblock; int ret; @@ -1685,7 +1688,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); - ret = jbd2_write_superblock(journal, write_op); + ret = jbd2_write_superblock(journal, write_flags); if (ret) goto out; @@ -1702,12 +1705,12 @@ out: /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. - * @write_op: With which operation should we write the journal sb + * @write_flags: Flags for the journal sb write operation * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ -static void jbd2_mark_journal_empty(journal_t *journal, int write_op) +static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) { journal_superblock_t *sb = journal->j_superblock; bool had_fast_commit = false; @@ -1733,7 +1736,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) had_fast_commit = true; } - jbd2_write_superblock(journal, write_op); + jbd2_write_superblock(journal, write_flags); if (had_fast_commit) jbd2_set_feature_fast_commit(journal); @@ -1898,7 +1901,7 @@ static int journal_get_superblock(journal_t *journal) J_ASSERT(bh != NULL); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { printk(KERN_ERR diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 8ca3527189f8..e699d6ab2c0e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); + ll_rw_block(REQ_OP_READ, nbufs, bufs); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); + ll_rw_block(REQ_OP_READ, nbufs, bufs); err = 0; failed: diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e49bb0938376..e9c308ae475f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2114,7 +2114,7 @@ out: /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation - * @page: to try and free + * @folio: Folio to detach data from. 
* * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 1d732fd223d4..332dc9ac47a9 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (rc) return rc; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(inode, iattr); + rc = dquot_transfer(mnt_userns, inode, iattr); if (rc) return rc; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 259326556ada..d1ec920aa030 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -301,13 +301,25 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, { int ret; - ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, jfs_get_block); + ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); return ret; } +static int jfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + jfs_write_failed(mapping, pos + len); + return ret; +} + static sector_t jfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, jfs_get_block); @@ -346,7 +358,7 @@ const struct address_space_operations jfs_aops = { .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, - .write_end = nobh_write_end, + .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, }; @@ -399,7 +411,7 @@ void jfs_truncate(struct inode *ip) { jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); - nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + block_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); IWRITE_LOCK(ip, RDWRLOCK_NORMAL); jfs_truncate_nolock(ip, ip->i_size); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 387652ae14c2..2e8461ce74de 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -618,7 +618,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, SetPageUptodate(page); } else { page = read_mapping_page(mapping, page_index, NULL); - if (IS_ERR(page) || !PageUptodate(page)) { + if (IS_ERR(page)) { jfs_err("read_mapping_page failed!"); return NULL; } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index e6f4ccc12f49..353f047e783c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6490,6 +6490,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } + ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; @@ -6505,10 +6506,6 @@ int smb2_write(struct ksmbd_work *work) data_buf = (char *)(((char *)&req->hdr.ProtocolId) + le16_to_cpu(req->DataOffset)); - ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); - if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) - writethrough = true; - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", fp->filp->f_path.dentry, offset, length); err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, @@ -7703,7 +7700,7 @@ int smb2_ioctl(struct 
ksmbd_work *work) { struct file_zero_data_information *zero_data; struct ksmbd_file *fp; - loff_t off, len; + loff_t off, len, bfz; if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, @@ -7720,19 +7717,26 @@ int smb2_ioctl(struct ksmbd_work *work) zero_data = (struct file_zero_data_information *)&req->Buffer[0]; - fp = ksmbd_lookup_fd_fast(work, id); - if (!fp) { - ret = -ENOENT; + off = le64_to_cpu(zero_data->FileOffset); + bfz = le64_to_cpu(zero_data->BeyondFinalZero); + if (off > bfz) { + ret = -EINVAL; goto out; } - off = le64_to_cpu(zero_data->FileOffset); - len = le64_to_cpu(zero_data->BeyondFinalZero) - off; + len = bfz - off; + if (len) { + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) { + ret = -ENOENT; + goto out; + } - ret = ksmbd_vfs_zero_data(work, fp, off, len); - ksmbd_fd_put(work, fp); - if (ret < 0) - goto out; + ret = ksmbd_vfs_zero_data(work, fp, off, len); + ksmbd_fd_put(work, fp); + if (ret < 0) + goto out; + } break; } case FSCTL_QUERY_ALLOCATED_RANGES: @@ -7806,14 +7810,24 @@ int smb2_ioctl(struct ksmbd_work *work) src_off = le64_to_cpu(dup_ext->SourceFileOffset); dst_off = le64_to_cpu(dup_ext->TargetFileOffset); length = le64_to_cpu(dup_ext->ByteCount); - cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp, - dst_off, length, 0); + /* + * XXX: It is not clear if FSCTL_DUPLICATE_EXTENTS_TO_FILE + * should fall back to vfs_copy_file_range(). This could be + * beneficial when re-exporting an nfs/smb mount, but note that + * this can result in a partial copy that returns an error status. + * If/when FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX is implemented, the + * fallback to vfs_copy_file_range() should be avoided when + * the flag DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC is set. + */ + cloned = vfs_clone_file_range(fp_in->filp, src_off, + fp_out->filp, dst_off, length, 0); if (cloned == -EXDEV || cloned == -EOPNOTSUPP) { ret = -EOPNOTSUPP; goto dup_ext_out; } else if (cloned != length) { cloned = vfs_copy_file_range(fp_in->filp, src_off, - fp_out->filp, dst_off, length, 0); + fp_out->filp, dst_off, + length, 0); if (cloned != length) { if (cloned < 0) ret = cloned; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index d035e060c2f0..35b55ee94fe5 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -5,16 +5,6 @@ * * Author(s): Long Li <longli@microsoft.com>, * Hyunchul Lee <hyc.lee@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details.
*/ #define SUBMOD_NAME "smb_direct" diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 8fef9de787d3..143bba4e4db8 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p) break; } ret = kernel_accept(iface->ksmbd_socket, &client_sk, - O_NONBLOCK); + SOCK_NONBLOCK); mutex_unlock(&iface->sock_release_lock); if (ret) { if (ret == -EAGAIN) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index dcdd07c6efff..7c849024999f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, */ int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags) { int err; @@ -1015,7 +1015,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len); - return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len); + return vfs_fallocate(fp->filp, + FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, + off, len); } int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, @@ -1046,7 +1048,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, *out_count = 0; end = start + length; while (start < end && *out_count < in_count) { - extent_start = f->f_op->llseek(f, start, SEEK_DATA); + extent_start = vfs_llseek(f, start, SEEK_DATA); if (extent_start < 0) { if (extent_start != -ENXIO) ret = (int)extent_start; @@ -1056,7 +1058,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, if (extent_start >= end) break; - extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE); + extent_end = vfs_llseek(f, extent_start, SEEK_HOLE); if (extent_end < 0) { if (extent_end != -ENXIO) ret = (int)extent_end; @@ -1777,6 +1779,10 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, 0); + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, + len, 0); if (ret < 0) return ret; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 8c37aaf936ab..70da4c0ba7ad 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, int attr_name_len); int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 0a22a2faf552..e1c4617de771 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file) +static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) { struct file_lock lock; @@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file) lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; if (file->f_file[O_RDONLY] && vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) goto out_err; @@ -225,7 +226,7 @@ again: if (match(lockhost, host)) { 
spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file)) + if (nlm_unlock_files(file, fl->fl_owner)) return 1; goto again; } @@ -282,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file) static void nlm_close_files(struct nlm_file *file) { - struct file *f; - - for (f = file->f_file[0]; f <= file->f_file[1]; f++) - if (f) - nlmsvc_ops->fclose(f); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); } /* diff --git a/fs/locks.c b/fs/locks.c index ca28e0e50e56..c266cfdc3291 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ -static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) +static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { - int type = flock_translate_cmd(cmd); - - if (type < 0) - return ERR_PTR(type); - - if (fl == NULL) { - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); - } else { - locks_init_lock(fl); - } + locks_init_lock(fl); fl->fl_file = filp; fl->fl_owner = filp; @@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - - return fl; } static int assign_type(struct file_lock *fl, long type) @@ -2097,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - struct fd f = fdget(fd); - struct file_lock *lock; - int can_sleep, unlock; - int error; - - error = -EBADF; - if (!f.file) - goto out; - - can_sleep = !(cmd & LOCK_NB); - cmd &= ~LOCK_NB; - unlock = (cmd == LOCK_UN); - - if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) - goto out_putf; + int can_sleep, error, type; + struct file_lock fl; + struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2123,36 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ if (cmd & LOCK_MAND) { pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); - error = 0; - goto out_putf; + return 0; } - lock = flock_make_lock(f.file, cmd, NULL); - if (IS_ERR(lock)) { - error = PTR_ERR(lock); + type = flock_translate_cmd(cmd & ~LOCK_NB); + if (type < 0) + return type; + + error = -EBADF; + f = fdget(fd); + if (!f.file) + return error; + + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) goto out_putf; - } - if (can_sleep) - lock->fl_flags |= FL_SLEEP; + flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, lock->fl_type); + error = security_file_lock(f.file, fl.fl_type); if (error) - goto out_free; + goto out_putf; + + can_sleep = !(cmd & LOCK_NB); + if (can_sleep) + fl.fl_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, - (can_sleep) ? F_SETLKW : F_SETLK, - lock); + (can_sleep) ? 
F_SETLKW : F_SETLK, + &fl); else - error = locks_lock_file_wait(f.file, lock); - - out_free: - locks_free_lock(lock); + error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); - out: + return error; } @@ -2614,7 +2593,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - flock_make_lock(filp, LOCK_UN, &fl); + flock_make_lock(filp, &fl, F_UNLCK); fl.fl_flags |= FL_CLOSE; if (filp->f_op->flock) diff --git a/fs/mount.h b/fs/mount.h index 0b6e08cf8afb..130c07c2f8d2 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -100,7 +100,6 @@ static inline int is_mounted(struct vfsmount *mnt) extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); -extern bool legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { diff --git a/fs/mpage.c b/fs/mpage.c index 0d25f44f5707..0f8ae954a579 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -75,26 +75,28 @@ static struct bio *mpage_bio_submit(struct bio *bio) * them. So when the buffer is up to date and the page size == block size, * this marks the page up to date instead of adding new buffers. */ -static void -map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) +static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, + int page_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *page_bh, *head; int block = 0; - if (!page_has_buffers(page)) { + head = folio_buffers(folio); + if (!head) { /* * don't make any buffers if there is only one buffer on - * the page and the page just needs to be set up to date + * the folio and the folio just needs to be set up to date */ if (inode->i_blkbits == PAGE_SHIFT && buffer_uptodate(bh)) { - SetPageUptodate(page); + folio_mark_uptodate(folio); return; } - create_empty_buffers(page, i_blocksize(inode), 0); + create_empty_buffers(&folio->page, i_blocksize(inode), 0); + head = folio_buffers(folio); } - head = page_buffers(page); + page_bh = head; do { if (block == page_block) { @@ -110,7 +112,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) struct mpage_readpage_args { struct bio *bio; - struct page *page; + struct folio *folio; unsigned int nr_pages; bool is_readahead; sector_t last_block_in_bio; @@ -130,8 +132,8 @@ struct mpage_readpage_args { */ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) { - struct page *page = args->page; - struct inode *inode = page->mapping->host; + struct folio *folio = args->folio; + struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -145,20 +147,23 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct block_device *bdev = NULL; int length; int fully_mapped = 1; - int op = REQ_OP_READ; + blk_opf_t opf = REQ_OP_READ; unsigned nblocks; unsigned relative_block; - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); + + /* MAX_BUF_PER_PAGE, for example */ + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); if (args->is_readahead) { - op |= REQ_RAHEAD; + opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; } - if (page_has_buffers(page)) + if (folio_buffers(folio)) goto confused; - block_in_file = 
(sector_t)page->index << (PAGE_SHIFT - blkbits); + block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + args->nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) @@ -191,9 +196,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) } /* - * Then do more get_blocks calls until we are done with this page. + * Then do more get_blocks calls until we are done with this folio. */ - map_bh->b_page = page; + map_bh->b_page = &folio->page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; @@ -216,12 +221,12 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to - * read it again. map_buffer_to_page copies the data - * we just collected from get_block into the page's buffers - * so readpage doesn't have to repeat the get_block call + * read it again. map_buffer_to_folio copies the data + * we just collected from get_block into the folio's buffers + * so read_folio doesn't have to repeat the get_block call */ if (buffer_uptodate(map_bh)) { - map_buffer_to_page(page, map_bh, page_block); + map_buffer_to_folio(folio, map_bh, page_block); goto confused; } @@ -246,18 +251,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); + folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); goto out; } } else if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); } /* - * This page will go to BIO. Do we need to send this BIO off first? + * This folio will go to BIO. Do we need to send this BIO off first? 
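A toy model of that contiguity test (args->last_block_in_bio != blocks[0] - 1): blocks keep accumulating into one bio as long as each extends the previous one; otherwise the bio is sent off and a new one is started. Names and types here are illustrative only.

#include <stdio.h>

typedef long long sector_model;

struct bio_model {                /* stand-in for a bio under construction */
	sector_model first, last; /* range of blocks already added */
	int open;
};

static void add_block(struct bio_model *bio, sector_model block)
{
	/* next block does not extend the bio contiguously: submit it */
	if (bio->open && bio->last != block - 1) {
		printf("submit bio %lld..%lld\n", bio->first, bio->last);
		bio->open = 0;
	}
	if (!bio->open) {
		bio->first = block;
		bio->open = 1;
	}
	bio->last = block;
}

int main(void)
{
	struct bio_model bio = { 0, 0, 0 };
	sector_model blocks[] = { 10, 11, 12, 40, 41 };

	for (int i = 0; i < 5; i++)
		add_block(&bio, blocks[i]);
	if (bio.open)   /* flush the trailing bio */
		printf("submit bio %lld..%lld\n", bio.first, bio.last);
	return 0;
}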
*/ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) args->bio = mpage_bio_submit(args->bio); @@ -266,10 +271,10 @@ alloc_new: if (args->bio == NULL) { if (first_hole == blocks_per_page) { if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), - page)) + &folio->page)) goto out; } - args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op, + args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) goto confused; @@ -277,7 +282,7 @@ alloc_new: } length = first_hole << blkbits; - if (bio_add_page(args->bio, page, length, 0) < length) { + if (!bio_add_folio(args->bio, folio, length, 0)) { args->bio = mpage_bio_submit(args->bio); goto alloc_new; } @@ -295,10 +300,10 @@ out: confused: if (args->bio) args->bio = mpage_bio_submit(args->bio); - if (!PageUptodate(page)) - block_read_full_folio(page_folio(page), args->get_block); + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, args->get_block); else - unlock_page(page); + folio_unlock(folio); goto out; } @@ -343,18 +348,17 @@ confused: */ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { - struct page *page; + struct folio *folio; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - args.page = page; + while ((folio = readahead_folio(rac))) { + prefetchw(&folio->flags); + args.folio = folio; args.nr_pages = readahead_count(rac); args.bio = do_mpage_readpage(&args); - put_page(page); } if (args.bio) mpage_bio_submit(args.bio); @@ -367,13 +371,11 @@ EXPORT_SYMBOL(mpage_readahead); int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { - .page = &folio->page, + .folio = folio, .nr_pages = 1, .get_block = get_block, }; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - args.bio = do_mpage_readpage(&args); if (args.bio) mpage_bio_submit(args.bio); @@ -402,7 +404,6 @@ struct mpage_data { struct bio *bio; sector_t last_block_in_bio; get_block_t *get_block; - unsigned use_writepage; }; /* @@ -622,15 +623,10 @@ confused: if (bio) bio = mpage_bio_submit(bio); - if (mpd->use_writepage) { - ret = mapping->a_ops->writepage(page, wbc); - } else { - ret = -EAGAIN; - goto out; - } /* * The caller has a ref on the inode, so *mapping is stable */ + ret = block_write_full_page(page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; @@ -642,8 +638,6 @@ out: * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. - * If this is NULL then use a_ops->writepage. Otherwise, go - * direct-to-BIO. * * This is a library function, which implements the writepages() * address_space_operation. 
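The next hunk removes the ->writepage fallback entirely, so mpage_writepages() always goes direct-to-BIO: a write_cache_pages() iteration wrapped in a block plug, with any leftover bio submitted at the end. A rough model of that control flow, using hypothetical names:

#include <stdio.h>

struct wb_ctx {
	int bio_open; /* models struct mpage_data carrying the open bio */
};

static int write_one_page(int page, struct wb_ctx *ctx)
{
	printf("map and queue page %d\n", page);
	ctx->bio_open = 1;
	return 0;
}

static int model_writepages(int npages)
{
	struct wb_ctx ctx = { 0 };
	int ret = 0;

	printf("start plug\n");  /* blk_start_plug(): batch the bios below */
	for (int i = 0; i < npages && !ret; i++)
		ret = write_one_page(i, &ctx); /* write_cache_pages() callback */
	if (ctx.bio_open)
		printf("submit trailing bio\n"); /* mpage_bio_submit() */
	printf("finish plug\n"); /* blk_finish_plug(): kick the batched I/O */
	return ret;
}

int main(void)
{
	return model_writepages(3);
}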
@@ -660,42 +654,17 @@ int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { + struct mpage_data mpd = { + .get_block = get_block, + }; struct blk_plug plug; int ret; blk_start_plug(&plug); - - if (!get_block) - ret = generic_writepages(mapping, wbc); - else { - struct mpage_data mpd = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = get_block, - .use_writepage = 1, - }; - - ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(mpd.bio); - } - blk_finish_plug(&plug); - return ret; -} -EXPORT_SYMBOL(mpage_writepages); - -int mpage_writepage(struct page *page, get_block_t get_block, - struct writeback_control *wbc) -{ - struct mpage_data mpd = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = get_block, - .use_writepage = 0, - }; - int ret = __mpage_writepage(page, wbc, &mpd); + ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) mpage_bio_submit(mpd.bio); + blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL(mpage_writepage); +EXPORT_SYMBOL(mpage_writepages); diff --git a/fs/namei.c b/fs/namei.c index 1f28d3f463c3..ed3ffd9b22a3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -567,7 +567,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; - unsigned seq, m_seq, r_seq; + unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -665,6 +665,13 @@ static void drop_links(struct nameidata *nd) } } +static void leave_rcu(struct nameidata *nd) +{ + nd->flags &= ~LOOKUP_RCU; + nd->seq = nd->next_seq = 0; + rcu_read_unlock(); +} + static void terminate_walk(struct nameidata *nd) { drop_links(nd); @@ -678,8 +685,7 @@ static void terminate_walk(struct nameidata *nd) nd->state &= ~ND_ROOT_GRABBED; } } else { - nd->flags &= ~LOOKUP_RCU; - rcu_read_unlock(); + leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; @@ -765,14 +771,13 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; - rcu_read_unlock(); + leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; @@ -780,7 +785,7 @@ out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; } @@ -788,7 +793,6 @@ out: * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into - * @seq: seq number to check @dentry against * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already @@ -797,15 +801,19 @@ out: * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). 
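A userspace sketch of the unlazy idea: take a reference first, then confirm that the seqcount sampled during the lazy walk is still valid, and undo everything on mismatch so the whole walk can restart in ref-walk mode. The seqcount here is a deliberately simplified stand-in.

#include <stdio.h>

static unsigned int dentry_seq; /* toy seqcount; real ones use barriers */

static unsigned int seq_begin(void) { return dentry_seq; }
static int seq_retry(unsigned int s) { return dentry_seq != s; }

static int try_to_unlazy_model(unsigned int sampled, int *refcount)
{
	(*refcount)++;             /* lockref_get_not_dead() stand-in */
	if (seq_retry(sampled)) {  /* something changed under us */
		(*refcount)--;     /* dput() stand-in */
		return 0;          /* caller returns -ECHILD, walk restarts */
	}
	return 1;                  /* now safely in ref-walk mode */
}

int main(void)
{
	int refs = 0;
	unsigned int s = seq_begin();

	dentry_seq++;              /* simulate a concurrent rename */
	printf("unlazy: %s, refs=%d\n",
	       try_to_unlazy_model(s, &refs) ? "ok" : "restart", refs);
	return 0;
}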
*/ -static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { + int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; - if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) - goto out2; + res = __legitimize_mnt(nd->path.mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + goto out2; + goto out1; + } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; @@ -818,7 +826,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is @@ -826,7 +834,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!legitimize_root(nd))) goto out_dput; - rcu_read_unlock(); + leave_rcu(nd); return true; out2: @@ -834,10 +842,10 @@ out2: out1: nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; out_dput: - rcu_read_unlock(); + leave_rcu(nd); dput(dentry); return false; } @@ -962,7 +970,7 @@ static int nd_jump_root(struct nameidata *nd) d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; - if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); @@ -1466,8 +1474,7 @@ EXPORT_SYMBOL(follow_down); * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ -static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode, unsigned *seqp) +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; @@ -1496,15 +1503,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; - *seqp = read_seqcount_begin(&dentry->d_seq); - *inode = dentry->d_inode; - /* - * We don't need to re-check ->d_seq after this - * ->d_inode read - there will be an RCU delay - * between mount hash removal and ->mnt_root - * becoming unpinned. - */ + nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; + // makes sure that non-RCU pathwalk could reach + // this state. 
+ if (read_seqretry(&mount_lock, nd->m_seq)) + return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) @@ -1515,8 +1519,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct inode **inode, - unsigned int *seqp) + struct path *path) { bool jumped; int ret; @@ -1524,16 +1527,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { - unsigned int seq = *seqp; - if (unlikely(!*inode)) - return -ENOENT; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + unsigned int seq = nd->next_seq; + if (likely(__follow_mount_rcu(nd, path))) return 0; - if (!try_to_unlazy_next(nd, dentry, seq)) - return -ECHILD; - // *path might've been clobbered by __follow_mount_rcu() + // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; + nd->next_seq = seq; + if (!try_to_unlazy_next(nd, dentry)) + return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { @@ -1546,9 +1548,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); - } else { - *inode = d_backing_inode(path->dentry); - *seqp = 0; /* out of RCU mode, so the value doesn't matter */ } return ret; } @@ -1607,9 +1606,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static struct dentry *lookup_fast(struct nameidata *nd, - struct inode **inode, - unsigned *seqp) +static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1620,8 +1617,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { - unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq); + dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); @@ -1629,28 +1625,16 @@ static struct dentry *lookup_fast(struct nameidata *nd, } /* - * This sequence count validates that the inode matches - * the dentry name information from lookup. - */ - *inode = d_backing_inode(dentry); - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return ERR_PTR(-ECHILD); - - /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. - * - * The memory barrier in read_seqcount_begin of child is - * enough, we can use __read_seqcount_retry here. 
*/ - if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) + if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - *seqp = seq; status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; - if (!try_to_unlazy_next(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ @@ -1731,7 +1715,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns, return inode_permission(mnt_userns, nd->inode, MAY_EXEC); } -static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) +static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; @@ -1746,7 +1730,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it - bool grabbed_link = legitimize_path(nd, link, seq); + bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; @@ -1760,11 +1744,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq, int flags) + struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link, seq); + int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) @@ -1774,7 +1758,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - last->seq = seq; + last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); @@ -1836,43 +1820,50 @@ all_done: // pure jump * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. + * + * NOTE: dentry must be what nd->next_seq had been sampled from. 
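Much of this namei.c churn is the same mechanical refactor: lookup_fast(), handle_mounts() and step_into() used to thread an (inode, seq) pair through out-parameters, and the sampled seqcount now lives in the walk context as nd->next_seq. A hedged before/after sketch with toy types:

#include <stdio.h>

/* before: every caller owns scratch variables and passes their addresses */
static int lookup_old(int *seqp) { *seqp = 42; return 0; }

/* after: the walk context itself carries the sampled value (nd->next_seq) */
struct walk_ctx { int next_seq; };
static int lookup_new(struct walk_ctx *nd) { nd->next_seq = 42; return 0; }

int main(void)
{
	int seq;
	struct walk_ctx nd = { 0 };

	lookup_old(&seq);
	lookup_new(&nd); /* later stages just read nd.next_seq */
	printf("%d %d\n", seq, nd.next_seq);
	return 0;
}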
*/ static const char *step_into(struct nameidata *nd, int flags, - struct dentry *dentry, struct inode *inode, unsigned seq) + struct dentry *dentry) { struct path path; - int err = handle_mounts(nd, dentry, &path, &inode, &seq); + struct inode *inode; + int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); + inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - if (!(nd->flags & LOOKUP_RCU)) { + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; - nd->seq = seq; + nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, seq)) + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } - return pick_link(nd, &path, inode, seq, flags); + return pick_link(nd, &path, inode, flags); } -static struct dentry *follow_dotdot_rcu(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; @@ -1889,30 +1880,30 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + // makes sure that non-RCU pathwalk could reach this state + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; - *inodep = parent->d_inode; - *seqp = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + nd->next_seq = read_seqcount_begin(&parent->d_seq); + // makes sure that non-RCU pathwalk could reach this state + if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); - return NULL; + nd->next_seq = nd->seq; + return nd->path.dentry; } -static struct dentry *follow_dotdot(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; @@ -1936,15 +1927,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd, dput(parent); return ERR_PTR(-ENOENT); } - *seqp = 0; - *inodep = parent->d_inode; return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - dget(nd->path.dentry); - return NULL; + return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) @@ -1952,8 +1940,6 @@ static const char *handle_dots(struct nameidata *nd, int type) if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; - struct inode *inode; - unsigned seq; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); @@ -1961,17 +1947,12 @@ static const char *handle_dots(struct nameidata *nd, int type) return error; } 
if (nd->flags & LOOKUP_RCU) - parent = follow_dotdot_rcu(nd, &inode, &seq); + parent = follow_dotdot_rcu(nd); else - parent = follow_dotdot(nd, &inode, &seq); + parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); - if (unlikely(!parent)) - error = step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); - else - error = step_into(nd, WALK_NOFOLLOW, - parent, inode, seq); + error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; @@ -1983,9 +1964,9 @@ * some fallback). */ smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } @@ -1995,8 +1976,6 @@ static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; - struct inode *inode; - unsigned seq; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and @@ -2007,7 +1986,7 @@ put_link(nd); return handle_dots(nd, nd->last_type); } - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { @@ -2017,7 +1996,7 @@ } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return step_into(nd, flags, dentry, inode, seq); + return step_into(nd, flags, dentry); } /* @@ -2372,6 +2351,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); + else + nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; @@ -2473,8 +2454,8 @@ static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - return PTR_ERR(step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq)); + nd->next_seq = nd->seq; + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise.
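One simplification in the ".." handling above is that follow_dotdot() and follow_dotdot_rcu() no longer return NULL for the already-at-root case; they hand back the current dentry with a reference taken, so handle_dots() needs only one step_into() call. A toy illustration of that uniform-return design choice (names invented):

#include <stdio.h>

struct dentry_model {
	int refcount;
	struct dentry_model *parent;
};

static struct dentry_model *dget_model(struct dentry_model *d)
{
	d->refcount++;
	return d;
}

/* post-rewrite shape: always a referenced dentry, never NULL */
static struct dentry_model *follow_dotdot_model(struct dentry_model *cur,
						struct dentry_model *root)
{
	if (cur == root)
		return dget_model(cur); /* was: return NULL + caller special case */
	return dget_model(cur->parent);
}

int main(void)
{
	struct dentry_model root = { 1, NULL };
	struct dentry_model child = { 1, &root };
	struct dentry_model *p = follow_dotdot_model(&child, &root);

	printf("parent refcount: %d\n", p->refcount); /* 2 */
	return 0;
}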
*/ @@ -3393,8 +3374,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; - unsigned seq; - struct inode *inode; struct dentry *dentry; const char *res; @@ -3410,7 +3389,7 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) @@ -3464,7 +3443,7 @@ static const char *open_last_lookups(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; diff --git a/fs/namespace.c b/fs/namespace.c index e6a7e769d25d..68789f896f08 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,7 +648,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /* call under rcu_read_lock */ -bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 8742d22dfd2b..0ce535852151 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, void netfs_readahead(struct readahead_control *ractl) { struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host); + struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); @@ -215,7 +215,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) { struct address_space *mapping = folio_file_mapping(folio); struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(mapping->host); + struct netfs_inode *ctx = netfs_inode(mapping->host); int ret; _enter("%lx", folio_index(folio)); @@ -297,6 +297,7 @@ zero_out: /** * netfs_write_begin - Helper to prepare for writing + * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from * @pos: File position at which the write will begin @@ -318,20 +319,21 @@ zero_out: * conflicting writes once the folio is grabbed and locked. It is passed a * pointer to the fsdata cookie that gets returned to the VM to be passed to * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. + * should go ahead or it may return an error. It may also unlock and put the + * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0 + * will cause the folio to be re-got and the process to be retried. * * The calling netfs must initialise a netfs context contiguous to the vfs * inode before calling this. * * This is usable whether or not caching is enabled. 
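The reworked contract is easiest to see as a retry loop: the filesystem hook now receives a pointer to the folio pointer and may consume the folio, null the pointer out, and return 0 to mean "re-get the folio and try again". A self-contained model; the hook name and its behavior are invented for the demo.

#include <stdio.h>
#include <stdlib.h>

struct folio_model { int locked; };

static struct folio_model *grab_folio(void)
{
	struct folio_model *f = malloc(sizeof(*f));
	if (!f)
		exit(1);
	f->locked = 1;
	return f;
}

static void drop_folio(struct folio_model *f) { free(f); }

/* in the spirit of ->check_write_begin(): may consume the folio and
 * request a retry by setting *foliop to NULL while returning 0 */
static int check_write_begin_model(struct folio_model **foliop, int *calls)
{
	if ((*calls)++ == 0) {  /* first pass: pretend to flush a conflict */
		drop_folio(*foliop);
		*foliop = NULL;
	}
	return 0;
}

int main(void)
{
	int calls = 0;
	struct folio_model *folio;

retry:
	folio = grab_folio();
	if (check_write_begin_model(&folio, &calls) < 0)
		return 1;
	if (!folio)
		goto retry;     /* folio was consumed: re-get it */
	printf("write_begin ok after %d hook call(s)\n", calls);
	drop_folio(folio);
	return 0;
}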
*/ -int netfs_write_begin(struct file *file, struct address_space *mapping, +int netfs_write_begin(struct netfs_inode *ctx, + struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **_folio, void **_fsdata) { struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(file_inode(file )); struct folio *folio; unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; @@ -347,13 +349,13 @@ retry: if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata); if (ret < 0) { trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; goto error; } + if (!folio) + goto retry; } if (folio_test_uptodate(folio)) @@ -415,8 +417,10 @@ have_folio_no_wait: error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); error: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } _leave(" = %d", ret); return ret; } diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b7b0e3d18d9e..43fac1b14e40 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat) /* * Miscellaneous functions. */ -static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx) +static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cookie = ctx->cache; diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e86107b30ba4..e17cdf53f6a7 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, { static atomic_t debug_ids; struct inode *inode = file ? 
file_inode(file) : mapping->host; - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; int ret; @@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); - netfs_clear_subrequests(rreq, false); - if (rreq->netfs_priv) - rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_clear_subrequests(rreq, false); + if (rreq->netfs_ops->free_request) + rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); kfree(rreq); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 79a8b451791f..943aeea1eb16 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -121,7 +121,7 @@ static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) } static struct bio * -do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, +do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, struct pnfs_block_extent *be, bio_end_io_t end_io, struct parallel_io *par, unsigned int offset, int *len) @@ -131,7 +131,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, u64 disk_addr, end; dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, - npg, rw, (unsigned long long)isect, offset, *len); + npg, (__force u32)op, (unsigned long long)isect, offset, *len); /* translate to device offset */ isect += be->be_v_offset; @@ -154,7 +154,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, retry: if (!bio) { - bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO); + bio = bio_alloc(map->bdev, bio_max_segs(npg), op, GFP_NOIO); bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; bio->bi_end_io = end_io; bio->bi_private = par; @@ -291,7 +291,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } else { bio = do_add_page_to_bio(bio, header->page_array.npages - i, - READ, + REQ_OP_READ, isect, pages[i], &map, &be, bl_end_io_read, par, pg_offset, &pg_len); @@ -420,9 +420,8 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) pg_len = PAGE_SIZE; bio = do_add_page_to_bio(bio, header->page_array.npages - i, - WRITE, isect, pages[i], &map, &be, - bl_end_io_write, par, - 0, &pg_len); + REQ_OP_WRITE, isect, pages[i], &map, + &be, bl_end_io_write, par, 0, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c8520284dda7..c1eda73254e1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, rv = NFS4_OK; break; case -ENOENT: + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); /* Embrace your forgetfulness! 
*/ rv = NFS4ERR_NOMATCHING_LAYOUT; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a8ecdd527662..0c4e8dd6aa96 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2124,6 +2124,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } goto out; } + file->f_mode |= FMODE_CAN_ODIRECT; err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d72b1b7ed74..549baed76351 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -533,9 +533,7 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .release_folio = nfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = nfs_migrate_page, -#endif + .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8f8cd6e2d4db..437ebe544aaf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -578,8 +578,10 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) #endif #ifdef CONFIG_MIGRATION -extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +int nfs_migrate_folio(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); +#else +#define nfs_migrate_folio NULL #endif static inline int diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 03d3a270eff4..e88f6b18445e 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -93,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_file_set_open_context(filp, ctx); nfs_fscache_open_file(inode, filp); err = 0; + filp->f_mode |= FMODE_CAN_ODIRECT; out_put_ctx: put_nfs_open_context(ctx); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c0fdcf8c0032..bb0e84a46d61 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4012,22 +4012,29 @@ static int _nfs4_discover_trunking(struct nfs_server *server, } page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (page == NULL || locations == NULL) - goto out; + if (!locations) + goto out_free; + locations->fattr = nfs_alloc_fattr(); + if (!locations->fattr) + goto out_free_2; status = nfs4_proc_get_locations(server, fhandle, locations, page, cred); if (status) - goto out; + goto out_free_3; for (i = 0; i < locations->nlocations; i++) test_fs_location_for_trunking(&locations->locations[i], clp, server); -out: - if (page) - __free_page(page); +out_free_3: + kfree(locations->fattr); +out_free_2: kfree(locations); +out_free: + __free_page(page); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2540b35ec187..9bab3e9c702a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2753,5 +2753,6 @@ again: goto again; nfs_put_client(clp); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 68a87be3e6f9..41a9b6b58fb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, pnfs_clear_lseg_state(lseg, lseg_list); pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) pnfs_clear_layoutreturn_waitbit(lo); @@ 
-1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { - if (atomic_dec_and_test(&lo->plh_outstanding)) - wake_up_var(&lo->plh_outstanding); + if (atomic_dec_and_test(&lo->plh_outstanding) && + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) @@ -2025,11 +2027,11 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. */ - if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); - lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - !atomic_read(&lo->plh_outstanding))); + lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, + TASK_KILLABLE)); if (IS_ERR(lseg)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); @@ -2152,6 +2154,12 @@ lookup_again: case -ERECALLCONFLICT: case -EAGAIN: break; + case -ENODATA: + /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ + pnfs_layout_set_fail_bit( + lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + goto out_put_layout_hdr; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); @@ -2407,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && + !pnfs_is_first_layoutget(lo)) goto out_forget; if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 07f11489e4e9..f331f067691b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -105,6 +105,7 @@ enum { NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ NFS_LAYOUT_HASHED, /* The layout visible */ + NFS_LAYOUT_DRAIN, }; enum layoutdriver_policy_flags { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5a9b043662e9..8ae2c8d1219d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -120,12 +120,8 @@ static void nfs_readpage_release(struct nfs_page *req, int error) if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) SetPageError(page); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - struct address_space *mapping = page_file_mapping(page); - if (PageUptodate(page)) nfs_fscache_write_page(inode, page); - else if (!PageError(page) && !PagePrivate(page)) - generic_error_remove_page(mapping, page); unlock_page(page); } nfs_release_request(req); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1c706465d090..69569696dde0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2119,27 +2119,27 @@ out_error: } #ifdef CONFIG_MIGRATION -int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { /* - * If PagePrivate is set, then the page is currently associated with + * If the private flag is set, the folio is currently associated with * an in-progress read or write request. Don't try to migrate it. * * FIXME: we could do this in principle, but we'll need a way to ensure * that we can safely release the inode reference while holding - * the page lock. + * the folio lock. 
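The decision the function body below implements, modeled in plain C: a folio carrying private data (an in-flight NFS request) is never migrated, and a folio under fscache I/O is only waited on when the migration mode allows blocking. The flag parameters stand in for the folio test bits.

#include <errno.h>
#include <stdio.h>

enum migrate_mode_model { ASYNC_MODE, SYNC_MODE };

static int migrate_decision(int has_private, int under_fscache,
			    enum migrate_mode_model mode)
{
	if (has_private)
		return -EBUSY;          /* in-flight read/write request */
	if (under_fscache) {
		if (mode == ASYNC_MODE)
			return -EBUSY;  /* not allowed to block here */
		/* folio_wait_fscache() would sleep at this point */
	}
	return 0;                       /* safe to hand off to migrate_folio() */
}

int main(void)
{
	printf("%d %d %d\n",
	       migrate_decision(1, 0, SYNC_MODE),   /* -EBUSY */
	       migrate_decision(0, 1, ASYNC_MODE),  /* -EBUSY */
	       migrate_decision(0, 1, SYNC_MODE));  /* 0 */
	return 0;
}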
*/ - if (PagePrivate(page)) + if (folio_test_private(src)) return -EBUSY; - if (PageFsCache(page)) { + if (folio_test_fscache(src)) { if (mode == MIGRATE_ASYNC) return -EBUSY; - wait_on_page_fscache(page); + folio_wait_fscache(src); } - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } #endif diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index f172412447f5..9cb2d590c036 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -309,11 +309,12 @@ nfsd_file_put(struct nfsd_file *nf) if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { nfsd_file_flush(nf); nfsd_file_put_noref(nf); - } else { + } else if (nf->nf_file) { nfsd_file_put_noref(nf); - if (nf->nf_file) - nfsd_file_schedule_laundrette(); - } + nfsd_file_schedule_laundrette(); + } else + nfsd_file_put_noref(nf); + if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) nfsd_file_gc(); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 61b2aae81abb..2acea7792bb2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -470,6 +470,15 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, return nfserr_bad_xdr; } } + if (bmval[1] & FATTR4_WORD1_TIME_CREATE) { + struct timespec64 ts; + + /* No Linux filesystem supports setting this attribute. */ + bmval[1] &= ~FATTR4_WORD1_TIME_CREATE; + status = nfsd4_decode_nfstime4(argp, &ts); + if (status) + return status; + } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { u32 set_it; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 847b482155ae..9a8b09afc173 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -465,7 +465,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \ + | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ FATTR4_WORD2_SECURITY_LABEL diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 840e3af63a6f..d79db56475d4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -577,6 +577,7 @@ out_err: ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { + ssize_t ret; /* * Limit copy to 4MB to prevent indefinitely blocking an nfsd @@ -587,7 +588,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, * limit like this and pipeline multiple COPY requests. 
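In outline, the function below clamps each COPY to 4MB and, with this change, retries with generic_copy_file_range() when the filesystem copy fails with -EOPNOTSUPP or -EXDEV. A sketch with stubbed-out helpers whose behavior is invented for the demo:

#include <errno.h>
#include <stdio.h>

/* stand-ins for vfs_copy_file_range()/generic_copy_file_range() */
static long long fs_copy(long long count)
{
	(void)count;
	return -EXDEV;          /* pretend the files cross filesystems */
}

static long long generic_copy(long long count)
{
	return count;           /* page-cache bounce always "succeeds" here */
}

static long long copy_range_model(long long count)
{
	long long ret;

	if (count > (1LL << 22))        /* cap one COPY at 4MB */
		count = 1LL << 22;

	ret = fs_copy(count);
	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = generic_copy(count);
	return ret;
}

int main(void)
{
	printf("copied %lld bytes\n", copy_range_model(10LL << 20)); /* 4194304 */
	return 0;
}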
*/ count = min_t(u64, count, 1 << 22); - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src, src_pos, dst, dst_pos, + count, 0); + return ret; } __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1173,6 +1179,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, nfsd_copy_write_verifier(verf, nn); err2 = filemap_check_wb_err(nf->nf_file->f_mapping, since); + err = nfserrno(err2); break; case -EINVAL: err = nfserr_notsupp; @@ -1180,8 +1187,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, default: nfsd_reset_write_verifier(nn); trace_nfsd_writeverf_reset(nn, rqstp, err2); + err = nfserrno(err2); } - err = nfserrno(err2); } else nfsd_copy_write_verifier(verf, nn); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index ca611ac09f7c..e74fda212620 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -70,7 +70,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, int mode, int mode_flags, + sector_t pblocknr, blk_opf_t opf, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; @@ -103,13 +103,13 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, } } - if (mode_flags & REQ_RAHEAD) { + if (opf & REQ_RAHEAD) { if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { err = -EBUSY; /* internal code */ brelse(bh); goto out_locked; } - } else { /* mode == READ */ + } else { /* opf == REQ_OP_READ */ lock_buffer(bh); } if (buffer_uptodate(bh)) { @@ -122,7 +122,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, mode_flags, bh); + submit_bh(opf, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ *submit_ptr = pblocknr; err = 0; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index bd5544e63a01..4bc5612dff94 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -34,8 +34,8 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); -int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, - int, struct buffer_head **, sector_t *); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, + blk_opf_t, struct buffer_head **, sector_t *); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index f544c22fff78..9f4d9432d38a 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -477,7 +477,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, sector_t submit_ptr = 0; int ret; - ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh, + ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh, &submit_ptr); if (ret) { if (ret != -EEXIST) @@ -495,8 +495,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); ret = nilfs_btnode_submit_block(btnc, ptr2, 0, - REQ_OP_READ, REQ_RAHEAD, - &ra_bh, 
&submit_ptr); + REQ_OP_READ | REQ_RAHEAD, + &ra_bh, &submit_ptr); if (likely(!ret || ret == -EEXIST)) brelse(ra_bh); else if (ret != -EBUSY) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index f8f4c2ff52f4..decd6471300b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -194,7 +194,7 @@ static struct page *nilfs_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) { kmap(page); if (unlikely(!PageChecked(page))) { - if (PageError(page) || !nilfs_check_page(page)) + if (!nilfs_check_page(page)) goto fail; } } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 04fdd420eae7..b0d22ff24b67 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -92,7 +92,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); if (vbn) bh->b_blocknr = vbn; out: @@ -129,9 +129,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode; int ret; - ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, - vbn ? : pbn, pbn, REQ_OP_READ, 0, - out_bh, &pbn); + ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? : pbn, pbn, + REQ_OP_READ, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index d29a0f2b9c16..cbf4fa60eea2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -111,8 +111,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, } static int -nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, - int mode, int mode_flags, struct buffer_head **out_bh) +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf, + struct buffer_head **out_bh) { struct buffer_head *bh; __u64 blknum = 0; @@ -126,12 +126,12 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, if (buffer_uptodate(bh)) goto out; - if (mode_flags & REQ_RAHEAD) { + if (opf & REQ_RAHEAD) { if (!trylock_buffer(bh)) { ret = -EBUSY; goto failed_bh; } - } else /* mode == READ */ + } else /* opf == REQ_OP_READ */ lock_buffer(bh); if (buffer_uptodate(bh)) { @@ -148,10 +148,11 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, mode_flags, bh); + submit_bh(opf, bh); ret = 0; - trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); + trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, + opf & REQ_OP_MASK); out: get_bh(bh); *out_bh = bh; @@ -172,7 +173,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; int err; - err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh); + err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, &first_bh); if (err == -EEXIST) /* internal code */ goto out; @@ -182,8 +183,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, if (readahead) { blkoff = block + 1; for (i = 0; i < nr_ra_blocks; i++, blkoff++) { - err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ, - REQ_RAHEAD, &bh); + err = nilfs_mdt_submit_block(inode, blkoff, + REQ_OP_READ | REQ_RAHEAD, &bh); if (likely(!err || err == -EEXIST)) brelse(bh); else if (err != -EBUSY) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 1344f7d475d3..aecda4fc95f5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -198,6 +198,9 @@ static 
inline int nilfs_acl_chmod(struct inode *inode) static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) { + if (S_ISLNK(inode->i_mode)) + return 0; + inode->i_mode &= ~current_umask(); return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index a8e88cc38e16..3267e96c256c 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -294,57 +294,57 @@ repeat: void nilfs_copy_back_pages(struct address_space *dmap, struct address_space *smap) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i, n; - pgoff_t index = 0; + pgoff_t start = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - n = pagevec_lookup(&pvec, smap, &index); + n = filemap_get_folios(smap, &start, ~0UL, &fbatch); if (!n) return; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i], *dpage; - pgoff_t offset = page->index; - - lock_page(page); - dpage = find_lock_page(dmap, offset); - if (dpage) { - /* overwrite existing page in the destination cache */ - WARN_ON(PageDirty(dpage)); - nilfs_copy_page(dpage, page, 0); - unlock_page(dpage); - put_page(dpage); - /* Do we not need to remove page from smap here? */ + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i], *dfolio; + pgoff_t index = folio->index; + + folio_lock(folio); + dfolio = filemap_lock_folio(dmap, index); + if (dfolio) { + /* overwrite existing folio in the destination cache */ + WARN_ON(folio_test_dirty(dfolio)); + nilfs_copy_page(&dfolio->page, &folio->page, 0); + folio_unlock(dfolio); + folio_put(dfolio); + /* Do we not need to remove folio from smap here? */ } else { - struct page *p; + struct folio *f; - /* move the page to the destination cache */ + /* move the folio to the destination cache */ xa_lock_irq(&smap->i_pages); - p = __xa_erase(&smap->i_pages, offset); - WARN_ON(page != p); + f = __xa_erase(&smap->i_pages, index); + WARN_ON(folio != f); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); - p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS); - if (unlikely(p)) { + f = __xa_store(&dmap->i_pages, index, folio, GFP_NOFS); + if (unlikely(f)) { /* Probably -ENOMEM */ - page->mapping = NULL; - put_page(page); + folio->mapping = NULL; + folio_put(folio); } else { - page->mapping = dmap; + folio->mapping = dmap; dmap->nrpages++; - if (PageDirty(page)) - __xa_set_mark(&dmap->i_pages, offset, + if (folio_test_dirty(folio)) + __xa_set_mark(&dmap->i_pages, index, PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 4f897e109547..cd7d09a569ff 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, const void *data, int data_type, struct inode *dir) { - __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 marks_mask = 0, marks_ignore_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | FANOTIFY_EVENT_FLAGS; const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct fsnotify_mark *mark; + bool ondir = event_mask & FAN_ONDIR; int type; pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", @@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } else if 
(!(fid_mode & FAN_REPORT_FID)) { /* Do we have a directory inode to report? */ - if (!dir && !(event_mask & FS_ISDIR)) + if (!dir && !ondir) return 0; } fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - /* Apply ignore mask regardless of mark's ISDIR flag */ - marks_ignored_mask |= mark->ignored_mask; + /* + * Apply ignore mask depending on event flags in ignore mask. + */ + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, ondir, type); /* - * If the event is on dir and this mark doesn't care about - * events on dir, don't send it! + * Send the event depending on event flags in mark mask. */ - if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + if (!fsnotify_mask_applicable(mark->mask, ondir, type)) continue; marks_mask |= mark->mask; @@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, *match_mask |= 1U << type; } - test_mask = event_mask & marks_mask & ~marks_ignored_mask; + test_mask = event_mask & marks_mask & ~marks_ignore_mask; /* * For dirent modification events (create/delete/move) that do not carry diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80e0ec95b113..1d9f11255c64 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + mflags |= FAN_MARK_IGNORE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c2255b440df9..f0e49a406ffa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { - fsn_mark->ignored_mask &= ~mask; + fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* @@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, * changes to the mask. * Destroy mark when only umask bits remain. */ - *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); + *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; @@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; + + /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set - * because of an ignored mask that is now going to survive FS_MODIFY. + * because of an ignore mask that is now going to survive FS_MODIFY. 
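 *
 * Illustrative userspace view (an assumption, not taken from this
 * patch; the path is hypothetical): a group that wants to stop
 * receiving modify events for one log file, with the ignore mask
 * surviving writers, would combine
 *
 *	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_IGNORE |
 *			  FAN_MARK_IGNORED_SURV_MODIFY,
 *		      FAN_MODIFY, AT_FDCWD, "/var/log/app.log");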
*/ - if ((fan_flags & FAN_MARK_IGNORED_MASK) && - (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) @@ -1120,10 +1129,10 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, bool recalc; spin_lock(&fsn_mark->lock); - if (!(fan_flags & FAN_MARK_IGNORED_MASK)) + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else - fsn_mark->ignored_mask |= mask; + fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); @@ -1187,6 +1196,37 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) sizeof(struct fanotify_error_event)); } +static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) +{ + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return -EEXIST; + + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + + return 0; +} + static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, @@ -1208,19 +1248,18 @@ static int fanotify_add_mark(struct fsnotify_group *group, } /* - * Non evictable mark cannot be downgraded to evictable mark. + * Check if requested mark flags conflict with an existing mark flags. */ - if (fan_flags & FAN_MARK_EVICTABLE && - !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { - ret = -EEXIST; + ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + if (ret) goto out; - } /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ - if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; @@ -1261,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, /* * If some other task has this inode open for write we should not add - * an ignored mark, unless that ignored mark is supposed to survive + * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. 
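 *
 * Illustrative consequence (the file name is hypothetical): if another
 * task has the file open for write, a plain
 *
 *	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_IGNORE,
 *		      FAN_MODIFY, AT_FDCWD, "data.log");
 *
 * returns success without adding anything; also passing
 * FAN_MARK_IGNORED_SURV_MODIFY makes the ignore mask stick.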
*/ - if ((flags & FAN_MARK_IGNORED_MASK) && + if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; @@ -1513,8 +1552,16 @@ static int fanotify_test_fid(struct dentry *dentry) return 0; } -static int fanotify_events_supported(struct path *path, __u64 mask) +static int fanotify_events_supported(struct fsnotify_group *group, + struct path *path, __u64 mask, + unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ + bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1526,6 +1573,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask) if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; + + /* + * We shouldn't have allowed setting dirent events and the directory + * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, + * but because we always allowed it, error only when using new APIs. + */ + if (strict_dir_events && mark_type == FAN_MARK_INODE && + !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + return -ENOTDIR; + return 0; } @@ -1540,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; - bool ignored = flags & FAN_MARK_IGNORED_MASK; + unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; @@ -1569,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; } - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) @@ -1589,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; - /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */ - if (ignored) + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + + /* + * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with + * FAN_MARK_IGNORED_MASK. 
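 *
 * Illustrative contrast between the two APIs (call shapes assumed, not
 * quoted from the patch). With the legacy flag FAN_ONDIR is stripped
 * and the ignore mask applies to directories regardless:
 *
 *	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_IGNORED_MASK,
 *		      FAN_OPEN | FAN_ONDIR, dirfd, NULL);
 *
 * whereas with the new flag directory events are ignored only because
 * FAN_ONDIR is set explicitly:
 *
 *	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_IGNORE |
 *			  FAN_MARK_IGNORED_SURV_MODIFY,
 *		      FAN_OPEN | FAN_ONDIR, dirfd, NULL);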
+ */ + if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } f = fdget(fanotify_fd); if (unlikely(!f.file)) @@ -1655,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; - if (flags & FAN_MARK_FLUSH) { + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); @@ -1671,8 +1739,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; - if (flags & FAN_MARK_ADD) { - ret = fanotify_events_supported(&path, mask); + if (mark_cmd == FAN_MARK_ADD) { + ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } @@ -1695,17 +1763,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; - /* - * FAN_RENAME is not allowed on non-dir (for now). - * We shouldn't have allowed setting any dirent events in mask of - * non-dir, but because we always allowed it, error only if group - * was initialized with the new flag FAN_REPORT_TARGET_FID. - */ - ret = -ENOTDIR; - if (inode && !S_ISDIR(inode->i_mode) && - ((mask & FAN_RENAME) || - ((mask & FANOTIFY_DIRENT_EVENTS) && - FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode)) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) goto path_put_and_out; /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ @@ -1717,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * events with parent/name info for non-directory. 
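 *
 * Illustrative restatement (adding_regular_mask is shorthand for the
 * ADD-and-not-ignore test below): a non-directory inode mark in a
 * FAN_REPORT_DIR_FID group implicitly opts into child events so the
 * event can carry the parent directory fid and name:
 *
 *	if ((fid_mode & FAN_REPORT_DIR_FID) && adding_regular_mask)
 *		mask |= FAN_EVENT_ON_CHILD;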
*/ if ((fid_mode & FAN_REPORT_DIR_FID) && - (flags & FAN_MARK_ADD) && !ignored) + (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + switch (mark_cmd) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, @@ -1800,7 +1862,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 59fb40abe33d..55081ae3a6ec 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, - mflags, mark->mask, mark->ignored_mask); + mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); @@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", - mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + mnt->mnt_id, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { struct super_block *sb = fsnotify_conn_sb(mark->connector); seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", - sb->s_dev, mflags, mark->mask, mark->ignored_mask); + sb->s_dev, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0b3e74935cb4..7974e91ffe13 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb) * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if + * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. 
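 *
 * Hedged sketch of the fast path (simplified from fsnotify_parent()):
 * the dentry flag turns the common no-watcher case into a single bit
 * test before any parent lookup is attempted:
 *
 *	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 *		goto notify_child;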
*/ void __fsnotify_update_child_dentry_flags(struct inode *inode) @@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type, struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; - __u32 marks_ignored_mask = 0; + __u32 marks_ignore_mask = 0; + bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; @@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - mark->ignored_mask = 0; + mark->ignore_mask = 0; } } @@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; - marks_ignored_mask |= mark->ignored_mask; + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, is_dir, type); } - pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", - __func__, group, mask, marks_mask, marks_ignored_mask, + pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", + __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); - if (!(test_mask & marks_mask & ~marks_ignored_mask)) + if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { @@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types( * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && - !(mark->mask & FS_EVENT_ON_CHILD)) + !(mark->mask & FS_EVENT_ON_CHILD) && + !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); @@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * If this is a modify event we may need to clear some ignored masks. - * In that case, the object with ignored masks will have the FS_MODIFY + * If this is a modify event we may need to clear some ignore masks. + * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. 
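 *
 * Hedged sketch of the resulting early-out (identifier names are
 * illustrative): an object whose marks only carry ignore masks still
 * contributes FS_MODIFY to the aggregated mask, so a test of the form
 *
 *	if (!(test_mask & marks_mask))
 *		return 0;
 *
 * cannot skip an object whose ignore masks must be cleared on modify.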
*/ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ed42a189faa2..1c4bfdab008d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -136,7 +136,7 @@ static inline u32 inotify_mask_to_arg(__u32 mask) IN_Q_OVERFLOW); } -/* intofiy userspace file descriptor functions */ +/* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 9e3964ea2ea0..9364d35b4a10 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -342,7 +342,7 @@ handle_zblock: for (i = 0; i < nr; i++) { tbh = arr[i]; if (likely(!buffer_uptodate(tbh))) - submit_bh(REQ_OP_READ, 0, tbh); + submit_bh(REQ_OP_READ, tbh); else ntfs_end_buffer_async_read(tbh, 1); } @@ -859,7 +859,7 @@ lock_retry_remap: do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); need_end_writeback = false; } bh = next; @@ -1187,7 +1187,7 @@ lock_retry_remap: BUG_ON(!buffer_mapped(tbh)); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, tbh); + submit_bh(REQ_OP_WRITE, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (is_mft && !sync) @@ -1659,7 +1659,7 @@ const struct address_space_operations ntfs_normal_aops = { .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ .bmap = ntfs_bmap, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_compressed_aops = { .writepage = ntfs_writepage, .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1688,7 +1688,7 @@ const struct address_space_operations ntfs_mst_aops = { .writepage = ntfs_writepage, /* Write dirty page to disk. 
*/ .dirty_folio = filemap_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h index 934d5f79b9e7..0cac5458c023 100644 --- a/fs/ntfs/aops.h +++ b/fs/ntfs/aops.h @@ -74,13 +74,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping, { struct page *page = read_mapping_page(mapping, index, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (!PageError(page)) - return page; - ntfs_unmap_page(page); - return ERR_PTR(-EIO); - } return page; } diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 4de597a83b88..52615e6090e1 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = (ATTR_RECORD*)((u8*)ctx->attr + le32_to_cpu(ctx->attr->length)); for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || + name_end > mrec_end) break; ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index a60f543e7557..587e9b187873 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -658,7 +658,7 @@ lock_retry_remap: } get_bh(tbh); tbh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, tbh); + submit_bh(REQ_OP_READ, tbh); } /* Wait for io completion on all buffer heads. */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index a8abe2296514..58b660dbbee9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -219,11 +219,6 @@ do_non_resident_extend: err = PTR_ERR(page); goto init_err_out; } - if (unlikely(PageError(page))) { - put_page(page); - err = -EIO; - goto init_err_out; - } /* * Update the initialized size in the ntfs inode. This is * enough to make ntfs_writepage() work. @@ -537,7 +532,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) lock_buffer(bh); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - return submit_bh(REQ_OP_READ, 0, bh); + return submit_bh(REQ_OP_READ, bh); } /** diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index bc1bf217b38e..6ce60ffc6ac0 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -807,7 +807,7 @@ map_vcn: * completed ignore errors afterwards as we can assume * that if one buffer worked all of them will work. */ - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); if (should_wait) { should_wait = false; wait_on_buffer(bh); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0d62cd5bb7f8..f7bf5ce960cc 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -583,7 +583,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, tbh); + submit_bh(REQ_OP_WRITE, tbh); } /* Wait on i/o completion of buffers. 
*/ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { @@ -780,7 +780,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, tbh); + submit_bh(REQ_OP_WRITE, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (!sync && ni->mft_no < vol->mftmirr_size) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 8e9d2b35175f..4a21745711fe 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -242,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) lock_buffer(bh); bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 3de5700a9b83..1835e35199c2 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1448,7 +1448,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, */ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, - u32 op) + enum req_op op) { int err = 0; struct bio *new, *bio = NULL; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index be4ebdd8048b..80104afeb2cd 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -629,7 +629,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { err = -EIO; @@ -851,12 +851,10 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct inode *inode = mapping->host; - struct ntfs_inode *ni = ntfs_i(inode); /* Redirect call to 'ntfs_writepage' for resident files. */ - get_block_t *get_block = is_resident(ni) ? 
NULL : &ntfs_get_block; - - return mpage_writepages(mapping, wbc, get_block); + if (is_resident(ntfs_i(mapping->host))) + return generic_writepages(mapping, wbc); + return mpage_writepages(mapping, wbc, ntfs_get_block); } static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 8de129a6419b..8dbdca03e1af 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -617,7 +617,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, struct ntfs_buffers *nb, int sync); int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, - u32 op); + enum req_op op); int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u64 *lbo, u64 *bytes); @@ -896,13 +896,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping, { struct page *page = read_mapping_page(mapping, index, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (!PageError(page)) - return page; - ntfs_unmap_page(page); - return ERR_PTR(-EIO); - } return page; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 35d40a67204c..af4157f61927 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -277,16 +277,14 @@ out: static int ocfs2_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_SHIFT; + loff_t start = folio_pos(folio); int ret, unlock = 1; - trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, - (page ? page->index : 0)); + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -296,11 +294,11 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) if (down_read_trylock(&oi->ip_alloc_sem) == 0) { /* - * Unlock the page and cycle ip_alloc_sem so that we don't + * Unlock the folio and cycle ip_alloc_sem so that we don't * busyloop waiting for ip_alloc_sem to unlock */ ret = AOP_TRUNCATED_PAGE; - unlock_page(page); + folio_unlock(folio); unlock = 0; down_read(&oi->ip_alloc_sem); up_read(&oi->ip_alloc_sem); @@ -313,21 +311,21 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. Callers * (generic_file_read, vm_ops->fault) are clever enough to check i_size - * and notice that the page they just read isn't needed. + * and notice that the folio they just read isn't needed. * * XXX sys_readahead() seems to get that wrong? 
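 *
 * The guard below, repeated here in condensed illustrative form, hands
 * back a zeroed, uptodate folio for such reads:
 *
 *	if (folio_pos(folio) >= i_size_read(inode)) {
 *		folio_zero_segment(folio, 0, folio_size(folio));
 *		folio_mark_uptodate(folio);
 *		return 0;
 *	}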
*/ if (start >= i_size_read(inode)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); ret = 0; goto out_alloc; } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, page); + ret = ocfs2_readpage_inline(inode, &folio->page); else - ret = block_read_full_folio(page_folio(page), ocfs2_get_block); + ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; out_alloc: @@ -336,7 +334,7 @@ out_inode_unlock: ocfs2_inode_unlock(inode, 0); out: if (unlock) - unlock_page(page); + folio_unlock(folio); return ret; } @@ -638,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); *wait_bh++=bh; } @@ -2464,7 +2462,7 @@ const struct address_space_operations ocfs2_aops = { .direct_IO = ocfs2_direct_IO, .invalidate_folio = block_invalidate_folio, .release_folio = ocfs2_release_folio, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index e7758778abef..196638a22b48 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -64,7 +64,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); @@ -147,7 +147,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); } read_failure: @@ -328,7 +328,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); continue; } } @@ -449,7 +449,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea0e70c0fce0..b13d344d40b6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -235,8 +235,6 @@ struct o2hb_region { * (hr_steady_iterations == 0) within hr_unsteady_iterations */ atomic_t hr_unsteady_iterations; - char hr_dev_name[BDEVNAME_SIZE]; - unsigned int hr_timeout_ms; /* randomized as the region goes up and down so that a node @@ -287,8 +285,8 @@ static void o2hb_write_timeout(struct work_struct *work) container_of(work, struct o2hb_region, hr_write_timeout_work.work); - mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " - "milliseconds\n", reg->hr_dev_name, + mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " + "milliseconds\n", reg->hr_bdev, jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { @@ -383,9 +381,9 @@ static void o2hb_nego_timeout(struct work_struct *work) if (master_node == o2nm_this_node()) { if (!test_bit(master_node, 
reg->hr_nego_node_bitmap)) { - printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n", + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, - config_item_name(®->hr_item), reg->hr_dev_name); + config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, @@ -399,8 +397,8 @@ static void o2hb_nego_timeout(struct work_struct *work) return; } - printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", + config_item_name(®->hr_item), reg->hr_bdev); /* approve negotiate timeout request. */ o2hb_arm_timeout(reg); @@ -419,9 +417,9 @@ static void o2hb_nego_timeout(struct work_struct *work) } } else { /* negotiate timeout with master node. */ - printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n", + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), - reg->hr_dev_name, master_node); + reg->hr_bdev, master_node); ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, master_node); if (ret) @@ -437,8 +435,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, struct o2hb_nego_msg *nego_msg; nego_msg = (struct o2hb_nego_msg *)msg->buf; - printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n", - nego_msg->node_num, config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", + nego_msg->node_num, config_item_name(®->hr_item), reg->hr_bdev); if (nego_msg->node_num < O2NM_MAX_NODES) set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); else @@ -452,8 +450,8 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, { struct o2hb_region *reg = data; - printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", + config_item_name(®->hr_item), reg->hr_bdev); o2hb_arm_timeout(reg); return 0; } @@ -503,8 +501,7 @@ static void o2hb_bio_end_io(struct bio *bio) static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots, int op, - int op_flags) + unsigned int max_slots, blk_opf_t opf) { int len, current_page; unsigned int vec_len, vec_start; @@ -518,7 +515,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. 
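 *
 * After this change the callers below express the operation and its
 * flags as a single blk_opf_t value, e.g. (condensed):
 *
 *	bio = bio_alloc(reg->hr_bdev, 16, REQ_OP_READ, GFP_ATOMIC);
 *	bio = bio_alloc(reg->hr_bdev, 16, REQ_OP_WRITE | REQ_SYNC,
 *			GFP_ATOMIC);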
*/ - bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC); + bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -566,7 +563,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, while(current_slot < max_slots) { bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, - REQ_OP_READ, 0); + REQ_OP_READ); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -598,8 +595,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - REQ_SYNC); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, + REQ_OP_WRITE | REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -689,8 +686,8 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) else errstr = ERRSTR3; - mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev, slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -863,8 +860,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) sizeof(o2hb_live_node_bitmap))) goto unlock; - printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", + config_item_name(®->hr_item), reg->hr_bdev); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -922,8 +919,8 @@ static int o2hb_check_slot(struct o2hb_region *reg, /* The node is live but pushed out a bad crc. We * consider it a transient miss but don't populate any * other values as they may be junk. */ - mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", - slot->ds_node_num, reg->hr_dev_name); + mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", + slot->ds_node_num, reg->hr_bdev); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -1002,11 +999,11 @@ fire_callbacks: slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); if (slot_dead_ms && slot_dead_ms != dead_ms) { /* TODO: Perhaps we can fail the region here. 
*/ - mlog(ML_ERROR, "Node %d on device %s has a dead count " + mlog(ML_ERROR, "Node %d on device %pg has a dead count " "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + slot->ds_node_num, reg->hr_bdev, slot_dead_ms, dead_ms); } goto out; @@ -1145,8 +1142,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to * disk */ - mlog(ML_ERROR, "Write error %d on device \"%s\"\n", - write_wc.wc_error, reg->hr_dev_name); + mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", + write_wc.wc_error, reg->hr_bdev); ret = write_wc.wc_error; goto bail; } @@ -1170,9 +1167,9 @@ bail: if (atomic_read(®->hr_steady_iterations) != 0) { if (atomic_dec_and_test(®->hr_unsteady_iterations)) { printk(KERN_NOTICE "o2hb: Unable to stabilize " - "heartbeat on region %s (%s)\n", + "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_dev_name); + reg->hr_bdev); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1494,7 +1491,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev); kfree(reg->hr_tmp_block); @@ -1641,7 +1638,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) unsigned int ret = 0; if (to_o2hb_region(item)->hr_bdev) - ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name); + ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev); return ret; } @@ -1798,8 +1795,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, goto out2; } - bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_logical_block_size(reg->hr_bdev); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, @@ -1895,8 +1890,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ret = -EIO; if (hb_task && o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", + config_item_name(®->hr_item), reg->hr_bdev); out3: if (ret < 0) { @@ -2088,10 +2083,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, quorum_region = 1; clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); spin_unlock(&o2hb_live_lock); - printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? 
"stopped" : "start aborted"), config_item_name(item), - reg->hr_dev_name); + reg->hr_bdev); } /* diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7497cd592258..9c67edd215d5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (status) return status; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { status = dquot_initialize(inode); if (status) return status; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 337527571461..740b64238312 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -277,7 +277,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ - OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) - || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e04358a46b68..1358981e80a3 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3146,48 +3146,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, struct inode *inode, u32 cpos, u32 num_clusters) { - int ret = 0; - loff_t offset, end, map_end; - pgoff_t page_index; - struct page *page; + int ret; + loff_t start, end; if (ocfs2_should_order_data(inode)) return 0; - offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; - end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1; - ret = filemap_fdatawrite_range(inode->i_mapping, - offset, end - 1); - if (ret < 0) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret < 0) mlog_errno(ret); - return ret; - } - - while (offset < end) { - page_index = offset >> PAGE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; - if (map_end > end) - map_end = end; - - page = find_or_create_page(inode->i_mapping, - page_index, GFP_NOFS); - BUG_ON(!page); - - wait_on_page_writeback(page); - if (PageError(page)) { - ret = -EIO; - mlog_errno(ret); - } else - mark_page_accessed(page); - - unlock_page(page); - put_page(page); - page = NULL; - offset = map_end; - if (ret) - break; - } return ret; } diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 0b0ae3ebb0cf..da7718cef735 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid || - !si->si_slots[preferred].sl_node_num) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid || - !si->si_slots[i].sl_node_num) { + if (!si->si_slots[i].sl_valid) { ret = i; break; } @@ -456,30 +454,24 @@ int 
ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - if (ocfs2_mount_local(osb)) - /* use slot 0 directly in local mode */ - slot = 0; - else { - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " - "already allocated to this node!\n", - slot, osb->dev_str); - } + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f7298816d8d9..013a727bd7c8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -172,7 +172,6 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, - Opt_nocluster, Opt_err, }; @@ -206,7 +205,6 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, - {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - tmp = OCFS2_MOUNT_NOCLUSTER; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { - ret = -EINVAL; - mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); - goto out; - } - tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && - !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? 
"writeback" : "ordered"); - if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && - !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) - printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " - "without cluster aware mode.\n", osb->dev_str); - atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; - case Opt_nocluster: - mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; - break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); - if (opts & OCFS2_MOUNT_NOCLUSTER) - seq_printf(s, ",nocluster"); - return 0; } @@ -1785,7 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb, if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(REQ_OP_READ, 0, 1, bh); + ll_rw_block(REQ_OP_READ, 1, bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { mlog_errno(-EIO); diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..8a813fa5ca56 100644 --- a/fs/open.c +++ b/fs/open.c @@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } +/** + * setattr_vfsuid - check and set ia_fsuid attribute + * @kuid: new inode owner + * + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) +{ + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; +} + +/** + * setattr_vfsgid - check and set ia_fsgid attribute + * @kgid: new inode owner + * + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) +{ + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + int chown_common(const struct path *path, uid_t user, gid_t group) { struct user_namespace *mnt_userns, *fs_userns; @@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group) mnt_userns = mnt_user_ns(path->mnt); fs_userns = i_user_ns(inode); - uid = mapped_kuid_user(mnt_userns, fs_userns, uid); - gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); - error = security_path_chown(path, uid, gid); + /* Continue to send actual fs values, not the mount values. 
*/ + error = security_path_chown( + path, + from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(mnt_userns, path->dentry, &newattrs, &delegated_inode); @@ -858,10 +888,13 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) + f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5ce27dde3c79..7a8c0c6e698d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -307,7 +307,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, folio_size(folio), inode->i_size, NULL, NULL, file); - /* this will only zero remaining unread portions of the page data */ + /* this will only zero remaining unread portions of the folio data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_folio(folio); @@ -315,8 +315,6 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) folio_set_error(folio); } else { folio_mark_uptodate(folio); - if (folio_test_error(folio)) - folio_clear_error(folio); ret = 0; } /* unlock the folio after the ->read_folio() routine completes */ diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 714ec569d25b..fdde6c56cc3d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -226,8 +226,7 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, /* Couldn't clone, so now we try to copy the data */ /* Check if lower fs supports seek operation */ - if (old_file->f_mode & FMODE_LSEEK && - old_file->f_op->llseek) + if (old_file->f_mode & FMODE_LSEEK) skip_hole = true; while (len) { @@ -331,8 +330,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, if (!err) { struct iattr attr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = stat->uid, - .ia_gid = stat->gid, + .ia_vfsuid = VFSUIDT_INIT(stat->uid), + .ia_vfsgid = VFSGIDT_INIT(stat->gid), }; err = ovl_do_notify_change(ofs, upperdentry, &attr); } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 492eddeb481f..7922b619f6c8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -454,23 +454,94 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. 
+ */ +static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. + */ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - const struct cred *old_cred; - struct posix_acl *acl; + struct posix_acl *acl, *clone; + struct path realpath; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (rcu) - return get_cached_acl_rcu(realinode, type); + /* Careful in RCU walk mode */ + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } - old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); - revert_creds(old_cred); + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; - return acl; + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. + */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. 
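
The clone-before-translate dance in ovl_get_acl() is a general pattern for transforming data that is cached and shared filesystem-wide: never mutate the shared object; copy it, transform the copy, and drop the reference to the original. A simplified, self-contained sketch of that pattern (illustrative only, with hypothetical types; the real logic is posix_acl_clone() plus ovl_idmap_posix_acl() above):

	#include <stdlib.h>
	#include <string.h>

	struct acl_sketch {
		int count;
		unsigned int ids[8];	/* shared, cached: must not be modified */
	};

	/* Clone the shared ACL, then remap ids in the private copy only. */
	static struct acl_sketch *clone_and_remap(const struct acl_sketch *shared,
						  unsigned int (*map)(unsigned int))
	{
		struct acl_sketch *clone = malloc(sizeof(*clone));

		if (!clone)
			return NULL;
		memcpy(clone, shared, sizeof(*clone));
		for (int i = 0; i < clone->count; i++)
			clone->ids[i] = map(clone->ids[i]);
		return clone;
	}
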
+ */ + posix_acl_release(acl); + return clone; } int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f34b7e02eee..6ec815b84d48 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); - struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); - - if (attr->ia_valid & ATTR_UID) - attr->ia_uid = mapped_kuid_user(upper_mnt_userns, - fs_userns, attr->ia_uid); - if (attr->ia_valid & ATTR_GID) - attr->ia_gid = mapped_kgid_user(upper_mnt_userns, - fs_userns, attr->ia_gid); - - return notify_change(upper_mnt_userns, upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, @@ -259,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + (void *)value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 962d32468eb4..1d17d7b13dcd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ -static struct posix_acl * +struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) } return clone; } +EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. 
@@ -361,8 +362,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -370,30 +371,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(uid, current_fsuid())) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, - i_user_ns(inode), + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pa->e_uid); - if (uid_eq(uid, current_fsuid())) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - gid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(gid)) { + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pa->e_gid); - if (in_group_p(gid)) { + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -699,7 +698,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; @@ -710,46 +709,127 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. 
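
The kuid_t/kgid_t to vfsuid_t/vfsgid_t switch in posix_acl_permission() leans on the ids being distinct types, so a mount-mapped id can only meet an unmapped one through a named helper such as vfsuid_eq_kuid() or vfsgid_in_group_p(). A stripped-down sketch of that type-safety trick (an assumed simplification for illustration, not the real mnt_idmapping definitions):

	/* Distinct wrapper structs: mixing them up without a helper is a
	 * compile-time error rather than a silent id-confusion bug. */
	typedef struct { unsigned int val; } kuid_sketch_t;
	typedef struct { unsigned int val; } vfsuid_sketch_t;

	/* Stand-in for a real idmapping lookup. */
	static inline vfsuid_sketch_t make_vfsuid_sketch(kuid_sketch_t kuid,
							 unsigned int offset)
	{
		return (vfsuid_sketch_t){ .val = kuid.val + offset };
	}

	static inline int vfsuid_eq_kuid_sketch(vfsuid_sketch_t vfsuid,
						kuid_sketch_t kuid)
	{
		return vfsuid.val == kuid.val;
	}
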
*/ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - struct user_namespace *mnt_userns, - void *value, size_t size, bool from_user) +static int posix_acl_fix_xattr_common(void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + int count; + + if (!header) + return -EINVAL; + if (size < sizeof(struct posix_acl_xattr_header)) + return -EINVAL; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return -EINVAL; + + count = posix_acl_xattr_count(size); + if (count < 0) + return -EINVAL; + if (count == 0) + return -EINVAL; + + return count; +} + +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; kuid_t uid; kgid_t gid; - if (!value) + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - if (size < sizeof(struct posix_acl_xattr_header)) + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, + vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, + vfsgid_into_kgid(vfsgid))); + break; + default: + break; + } + } +} + +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + kuid_t uid; + kgid_t gid; + + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) return; - if (count == 0) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = VFSUIDT_INIT(uid); + uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = VFSGIDT_INIT(gid); + gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); + break; + default: + break; + } + } +} + +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + kuid_t uid; + kgid_t gid; + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; for (end = entry + count; entry != end; entry++) { switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); - if (from_user) - uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); - else - uid = mapped_kuid_fs(mnt_userns, 
&init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - if (from_user) - gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); - else - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -758,34 +838,20 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_from_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, - size, true); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); } -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_to_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, - size, false); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } /* diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 14658b009f1b..ffbadb8b3032 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -55,6 +55,7 @@ static void free_pstore_private(struct pstore_private *private) return; if (private->record) { kfree(private->record->buf); + kfree(private->record->priv); kfree(private->record); } kfree(private); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index e26162f102ff..b2fd3c20e7c2 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,11 +28,14 @@ #include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> +#include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> #include <linux/workqueue.h> +#include <crypto/acompress.h> + #include "internal.h" /* @@ -90,7 +93,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ -static struct crypto_comp *tfm; +static struct crypto_acomp *tfm; +static struct acomp_req *creq; struct pstore_zbackend { int (*zbufsize)(size_t size); @@ -268,12 +272,21 @@ static const struct pstore_zbackend zbackends[] = { static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { + struct scatterlist src, dst; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); + sg_init_table(&src, 1); + sg_set_buf(&src, in, inlen); + + sg_init_table(&dst, 1); + sg_set_buf(&dst, out, outlen); + + acomp_request_set_params(creq, &src, &dst, inlen, outlen); + + ret = crypto_acomp_compress(creq); if (ret) { pr_err("crypto_comp_compress failed, ret = %d!\n", ret); return ret; @@ -284,7 +297,7 @@ static int pstore_compress(const void *in, void *out, static void allocate_buf_for_compression(void) { - struct crypto_comp *ctx; + struct 
crypto_acomp *acomp; int size; char *buf; @@ -296,7 +309,7 @@ static void allocate_buf_for_compression(void) if (!psinfo || tfm) return; - if (!crypto_has_comp(zbackend->name, 0, 0)) { + if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) { pr_err("Unknown compression: %s\n", zbackend->name); return; } @@ -315,16 +328,24 @@ static void allocate_buf_for_compression(void) return; } - ctx = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(ctx)) { + acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(acomp)) { kfree(buf); pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(ctx)); + PTR_ERR(acomp)); + return; + } + + creq = acomp_request_alloc(acomp); + if (!creq) { + crypto_free_acomp(acomp); + kfree(buf); + pr_err("acomp_request_alloc('%s') failed\n", zbackend->name); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = ctx; + tfm = acomp; big_oops_buf_sz = size; big_oops_buf = buf; @@ -334,7 +355,8 @@ static void allocate_buf_for_compression(void) static void free_buf_for_compression(void) { if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - crypto_free_comp(tfm); + acomp_request_free(creq); + crypto_free_acomp(tfm); tfm = NULL; } kfree(big_oops_buf); @@ -671,6 +693,8 @@ static void decompress_record(struct pstore_record *record) int ret; int unzipped_len; char *unzipped, *workspace; + struct acomp_req *dreq; + struct scatterlist src, dst; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -694,16 +718,30 @@ static void decompress_record(struct pstore_record *record) if (!workspace) return; + dreq = acomp_request_alloc(tfm); + if (!dreq) { + kfree(workspace); + return; + } + + sg_init_table(&src, 1); + sg_set_buf(&src, record->buf, record->size); + + sg_init_table(&dst, 1); + sg_set_buf(&dst, workspace, unzipped_len); + + acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len); + /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_comp_decompress(tfm, record->buf, record->size, - workspace, &unzipped_len); + ret = crypto_acomp_decompress(dreq); if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); + pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret); kfree(workspace); return; } /* Append ECC notice to decompressed buffer. */ + unzipped_len = dreq->dlen; memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); @@ -711,6 +749,7 @@ static void decompress_record(struct pstore_record *record) unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, GFP_KERNEL); kfree(workspace); + acomp_request_free(dreq); if (!unzipped) return; @@ -769,6 +808,7 @@ void pstore_get_backend_records(struct pstore_info *psi, if (rc) { /* pstore_mkfile() did not take record, so free it. */ kfree(record->buf); + kfree(record->priv); kfree(record); if (rc != -EEXIST || !quiet) failed++; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 7c8f8feac6c3..017d0d4ad329 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -363,7 +363,7 @@ static int psz_kmsg_recover_data(struct psz_context *cxt) rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), zone->off); if (rcnt != zone->buffer_size + sizeof(*buf)) - return (int)rcnt < 0 ? (int)rcnt : -EIO; + return rcnt < 0 ? 
rcnt : -EIO; } return 0; } @@ -372,7 +372,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) { struct pstore_zone_info *info = cxt->pstore_zone_info; struct pstore_zone *zone; - size_t rcnt, len; + ssize_t rcnt, len; struct psz_buffer *buf; struct psz_kmsg_header *hdr; struct timespec64 time = { }; @@ -400,7 +400,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) continue; } else if (rcnt != len) { pr_err("read %s with id %lu failed\n", zone->name, i); - return (int)rcnt < 0 ? (int)rcnt : -EIO; + return rcnt < 0 ? rcnt : -EIO; } if (buf->sig != zone->buffer->sig) { @@ -502,7 +502,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) rcnt = info->read((char *)&tmpbuf, len, zone->off); if (rcnt != len) { pr_debug("read zone %s failed\n", zone->name); - return (int)rcnt < 0 ? (int)rcnt : -EIO; + return rcnt < 0 ? rcnt : -EIO; } if (tmpbuf.sig != zone->buffer->sig) { @@ -544,7 +544,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) rcnt = info->read(buf, len - start, off + start); if (rcnt != len - start) { pr_err("read zone %s failed\n", zone->name); - ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + ret = rcnt < 0 ? rcnt : -EIO; goto free_oldbuf; } @@ -552,7 +552,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) rcnt = info->read(buf + len - start, start, off); if (rcnt != start) { pr_err("read zone %s failed\n", zone->name); - ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + ret = rcnt < 0 ? rcnt : -EIO; goto free_oldbuf; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a74aef99bd3d..28966da7834e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -79,6 +79,7 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include "../internal.h" /* ugh */ #include <linux/uaccess.h> @@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty); int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) @@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire); int dquot_commit(struct dquot *dquot) { int ret = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); if (!clear_dquot_dirty(dquot)) goto out_lock; /* Inactive dquot can be only if there was error during read/init @@ -481,6 +487,7 @@ int dquot_commit(struct dquot *dquot) else ret = -EIO; out_lock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit); int dquot_release(struct dquot *dquot) { int ret = 0, ret2 = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; @@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot) } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_dqlock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ 
-2075,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, + struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; @@ -2085,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ - dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (i_uid_needs_update(mnt_userns, iattr, inode)) { + kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); + + dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); @@ -2096,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } transfer_to[USRQUOTA] = dquot; } - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ - dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (i_gid_needs_update(mnt_userns, iattr, inode)) { + kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); + + dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); diff --git a/fs/read_write.c b/fs/read_write.c index b1b1cdfee9d3..ea59dd0095c2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -227,12 +227,6 @@ loff_t noop_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int whence) -{ - return -ESPIPE; -} -EXPORT_SYMBOL(no_llseek); - loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); @@ -290,14 +284,9 @@ EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { - loff_t (*fn)(struct file *, loff_t, int); - - fn = no_llseek; - if (file->f_mode & FMODE_LSEEK) { - if (file->f_op->llseek) - fn = file->f_op->llseek; - } - return fn(file, offset, whence); + if (!(file->f_mode & FMODE_LSEEK)) + return -ESPIPE; + return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); @@ -1263,6 +1252,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count, fl); file_end_write(out.file); } else { + if (out.file->f_flags & O_NONBLOCK) + fl |= SPLICE_F_NONBLOCK; + retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } @@ -1397,28 +1389,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_copy_file_range); -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - /* - * Although we now allow filesystems to handle cross sb copy, passing - * a file of the wrong filesystem type to filesystem driver can result - * in an attempt to dereference the wrong type of ->private_data, so - * avoid doing that until we really have a good reason. NFS defines - * several different file_system_type structures, but they all end up - * using the same ->copy_file_range() function pointer. 
- */ - if (file_out->f_op->copy_file_range && - file_out->f_op->copy_file_range == file_in->f_op->copy_file_range) - return file_out->f_op->copy_file_range(file_in, pos_in, - file_out, pos_out, - len, flags); - - return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); -} - /* * Performs necessary checks before doing a file copy * @@ -1440,6 +1410,24 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, if (ret) return ret; + /* + * We allow some filesystems to handle cross sb copy, but passing + * a file of the wrong filesystem type to filesystem driver can result + * in an attempt to dereference the wrong type of ->private_data, so + * avoid doing that until we really have a good reason. + * + * nfs and cifs define several different file_system_type structures + * and several different sets of file_operations, but they all end up + * using the same ->copy_file_range() function pointer. + */ + if (file_out->f_op->copy_file_range) { + if (file_in->f_op->copy_file_range != + file_out->f_op->copy_file_range) + return -EXDEV; + } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { + return -EXDEV; + } + /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; @@ -1505,26 +1493,41 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, file_start_write(file_out); /* - * Try cloning first, this is supported by more file systems, and - * more efficient if both clone and copy are supported (e.g. NFS). + * Cloning is supported by more file systems, so we implement copy on + * same sb using clone, but for filesystems where both clone and copy + * are supported (e.g. nfs,cifs), we only call the copy method. */ + if (file_out->f_op->copy_file_range) { + ret = file_out->f_op->copy_file_range(file_in, pos_in, + file_out, pos_out, + len, flags); + goto done; + } + if (file_in->f_op->remap_file_range && file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { - loff_t cloned; - - cloned = file_in->f_op->remap_file_range(file_in, pos_in, + ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (cloned > 0) { - ret = cloned; + if (ret > 0) goto done; - } } - ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - WARN_ON_ONCE(ret == -EOPNOTSUPP); + /* + * We can get here for same sb copy of filesystems that do not implement + * ->copy_file_range() in case filesystem does not support clone or in + * case filesystem supports clone but rejected the clone request (e.g. + * because it was not block aligned). + * + * In both cases, fall back to kernel copy so we are able to maintain a + * consistent story about which filesystems support copy_file_range() + * and which filesystems do not, that will allow userspace tools to + * make consistent decisions w.r.t using copy_file_range().
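
The -EXDEV returns introduced above are exactly what callers are expected to cope with. A typical userspace fallback (a minimal sketch using only documented copy_file_range(2) behaviour; the buffer size is an arbitrary choice) looks like:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <unistd.h>

	/* Try copy_file_range(); on EXDEV/ENOSYS fall back to read/write. */
	static int copy_bytes(int fd_in, int fd_out, size_t len)
	{
		ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);

		if (n >= 0)
			return 0;	/* sketch: ignores short copies */
		if (errno != EXDEV && errno != ENOSYS)
			return -1;

		char buf[65536];
		ssize_t r = 0;

		while (len > 0 && (r = read(fd_in, buf,
				len < sizeof(buf) ? len : sizeof(buf))) > 0) {
			if (write(fd_out, buf, r) != r)
				return -1;
			len -= (size_t)r;
		}
		return r < 0 ? -1 : 0;
	}
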
+ */ + ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, + flags); + done: if (ret > 0) { fsnotify_access(file_in); @@ -1649,7 +1652,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && + !((iocb->ki_flags & IOCB_DIRECT) || + (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0cffe054b78e..b9580a6515ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh; struct item_head *ih, tmp_ih; b_blocknr_t blocknr; - char *p = NULL; + char *p; int chars; int ret; int result; @@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) { pathrelse(&path); - if (p) - kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; /* @@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return ret; } /* requested data are in direct item(s) */ @@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * when it is stored in direct item(s) */ pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return -ENOENT; } @@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * sure we need to. But, this means the item might move if * kmap schedules */ - if (!p) - p = (char *)kmap(bh_result->b_page); - + p = (char *)kmap(bh_result->b_page); p += offset; memset(p, 0, inode->i_sb->s_blocksize); do { @@ -2664,7 +2656,7 @@ static int reiserfs_write_full_page(struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); nr++; } put_bh(bh); @@ -2724,7 +2716,7 @@ fail: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); nr++; } put_bh(bh); @@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d8cc9a366124..94addfcefede 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -650,7 +650,7 @@ static void submit_logged_buffer(struct buffer_head *bh) BUG(); if (!buffer_uptodate(bh)) BUG(); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); } static void submit_ordered_buffer(struct buffer_head *bh) @@ -660,7 +660,7 @@ static void submit_ordered_buffer(struct 
buffer_head *bh) clear_buffer_dirty(bh); if (!buffer_uptodate(bh)) BUG(); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); } #define CHUNK_SIZE 32 @@ -868,7 +868,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 1, &bh); spin_lock(lock); } put_bh(bh); @@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh); + ll_rw_block(REQ_OP_WRITE, 1, &tbh); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2240,7 +2240,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks); + ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2342,7 +2342,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(REQ_OP_READ, 0, j, bhlist); + ll_rw_block(REQ_OP_READ, j, bhlist); for (i = 1; i < j; i++) brelse(bhlist[i]); bh = bhlist[0]; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index ef42729216d1..9a293609a022 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j); } brelse(bh[j]); } @@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index cfb7c44c7366..c88cd2ce0665 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1702,7 +1702,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s)); + ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s)); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index bd073836e141..436641369283 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -440,16 +440,9 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) */ mapping_set_gfp_mask(mapping, GFP_NOFS); page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (PageError(page)) - goto fail; - } return page; - -fail: - reiserfs_put_page(page); - return ERR_PTR(-EIO); } static inline __u32 xattr_hash(const char *msg, int len) diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..046a513dbc3a 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -71,7 +71,8 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, * Otherwise, make sure the count is also block-aligned, having * already confirmed the starting offsets' block alignment. 
*/ - if (pos_in + count == size_in) { + if (pos_in + count == size_in && + (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { bcount = ALIGN(size_in, bs) - pos_in; } else { if (!IS_ALIGNED(count, bs)) @@ -148,16 +149,7 @@ static int generic_remap_check_len(struct inode *inode_in, /* Read a page's worth of file data into the page cache. */ static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos) { - struct folio *folio; - - folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); - if (IS_ERR(folio)) - return folio; - if (!folio_test_uptodate(folio)) { - folio_put(folio); - return ERR_PTR(-EIO); - } - return folio; + return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); } /* diff --git a/fs/splice.c b/fs/splice.c index 047b79db8eb5..93a2c9bf6249 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -814,17 +814,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, { struct pipe_inode_info *pipe; long ret, bytes; - umode_t i_mode; size_t len; int i, flags, more; /* - * We require the input being a regular file, as we don't want to - * randomly drop data for eg socket -> socket splicing. Use the - * piped splicing for that! + * We require the input to be seekable, as we don't want to randomly + * drop data for eg socket -> socket splicing. Use the piped splicing + * for that! */ - i_mode = file_inode(in)->i_mode; - if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + if (unlikely(!(in->f_mode & FMODE_LSEEK))) return -EINVAL; /* diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8e495d8eb86..7f0904b20329 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -454,7 +454,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) int expected = index == file_end ? (i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; - int res; + int res = 0; void *pageaddr; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", @@ -467,14 +467,15 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) if (index < file_end || squashfs_i(inode)->fragment_block == SQUASHFS_INVALID_BLK) { u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) + + res = read_blocklist(inode, index, &block); + if (res < 0) goto error_out; - if (bsize == 0) + if (res == 0) res = squashfs_readpage_sparse(page, expected); else - res = squashfs_readpage_block(page, block, bsize, expected); + res = squashfs_readpage_block(page, block, res, expected); } else res = squashfs_readpage_fragment(page, expected); @@ -488,11 +489,11 @@ out: memset(pageaddr, 0, PAGE_SIZE); kunmap_atomic(pageaddr); flush_dcache_page(page); - if (!PageError(page)) + if (res == 0) SetPageUptodate(page); unlock_page(page); - return 0; + return res; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de7252715b12..81d26abf486f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * Only one instances directory is allowed. * - * The instances directory is special as it allows for mkdir and rmdir to + * The instances directory is special as it allows for mkdir and rmdir * to be done by userspace. 
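
With this series FMODE_LSEEK becomes the single source of truth for seekability: do_dentry_open() clears it when ->llseek is absent, vfs_llseek() turns its absence into -ESPIPE, and ovl_copy_up_data() and splice_direct_to_actor() above test only the flag. The visible behaviour is unchanged: sendfile(2) still refuses a non-seekable source, and the pipe-based splice(2) path stays the intended route for e.g. socket input, roughly like this (minimal sketch, error handling elided):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	/* Move up to len bytes from a (non-seekable) socket into a file by
	 * bouncing them through a pipe, since the socket cannot be the
	 * sendfile() source. */
	static ssize_t sock_to_file(int sock_fd, int file_fd, size_t len)
	{
		int p[2];
		ssize_t in, out;

		if (pipe(p) < 0)
			return -1;
		in = splice(sock_fd, NULL, p[1], NULL, len, SPLICE_F_MOVE);
		out = in > 0 ? splice(p[0], NULL, file_fd, NULL,
				      (size_t)in, SPLICE_F_MOVE) : in;
		close(p[0]);
		close(p[1]);
		return out;
	}
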
When a mkdir or rmdir is performed, the inode * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 04ced154960f..f2353dd676ef 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1461,29 +1461,6 @@ static bool ubifs_dirty_folio(struct address_space *mapping, return ret; } -#ifdef CONFIG_MIGRATION -static int ubifs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - int rc; - - rc = migrate_page_move_mapping(mapping, newpage, page, 0); - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (PagePrivate(page)) { - detach_page_private(page); - attach_page_private(newpage, (void *)1); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - return MIGRATEPAGE_SUCCESS; -} -#endif - static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) { struct inode *inode = folio->mapping->host; @@ -1649,10 +1626,8 @@ const struct address_space_operations ubifs_file_address_operations = { .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, .dirty_folio = ubifs_dirty_folio, -#ifdef CONFIG_MIGRATION - .migratepage = ubifs_migrate_page, -#endif - .release_folio = ubifs_release_folio, + .migrate_folio = filemap_migrate_folio, + .release_folio = ubifs_release_folio, }; const struct inode_operations ubifs_file_inode_operations = { diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 42e3e551fa4c..cad3772f9dbe 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 73720320f0ab..a2adf6293093 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index edc88716751a..8d06daed549f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1214,7 +1214,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, if (buffer_uptodate(bh)) return bh; - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 075d3d9114c8..bd810d8239f2 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -296,7 +296,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ufs_error(inode->i_sb, __func__, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index b721d0bda5e5..391efaf1d528 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -193,7 +193,7 @@ static struct page *ufs_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) { kmap(page); if (unlikely(!PageChecked(page))) { - if (PageError(page) || !ufs_check_page(page)) + if (!ufs_check_page(page)) goto fail; } } diff --git 
a/fs/ufs/util.c b/fs/ufs/util.c index 4fa633f84274..08ddf41eaaad 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -264,17 +264,6 @@ struct page *ufs_get_locked_page(struct address_space *mapping, put_page(page); return NULL; } - - if (!PageUptodate(page) || PageError(page)) { - unlock_page(page); - put_page(page); - - printk(KERN_ERR "ufs_change_blocknr: " - "can not read page: ino %lu, index: %lu\n", - inode->i_ino, index); - - return ERR_PTR(-EIO); - } } if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..de86f5b2859f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg) } static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; + msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; - if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) - address &= PAGE_MASK; - msg.arg.pagefault.address = address; + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. @@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, - ctx->features); + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 54598cd80145..aad1f1d998b9 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -14,11 +14,11 @@ config FS_VERITY help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported - filesystems (currently EXT4 and F2FS), userspace can use an - ioctl to enable verity for a file, which causes the filesystem - to build a Merkle tree for the file. The filesystem will then - transparently verify any data read from the file against the - Merkle tree. The file is also made read-only. + filesystems (currently ext4, f2fs, and btrfs), userspace can + use an ioctl to enable verity for a file, which causes the + filesystem to build a Merkle tree for the file. The filesystem + will then transparently verify any data read from the file + against the Merkle tree. The file is also made read-only. 
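
The "ioctl to enable verity" that the Kconfig help text mentions is FS_IOC_ENABLE_VERITY. A minimal invocation (standard fs-verity UAPI usage, not part of this patch) looks like:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fsverity.h>

	/* Enable fs-verity on a fully written file; it must have no open
	 * writers, and afterwards it is permanently read-only. */
	static int enable_verity(const char *path)
	{
		struct fsverity_enable_arg arg = {
			.version = 1,
			.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
			.block_size = 4096,
		};
		int fd = open(path, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);
		close(fd);
		return ret;
	}
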
This serves as an integrity check, but the availability of the Merkle tree root hash also allows efficiently supporting diff --git a/fs/xattr.c b/fs/xattr.c index e8dd03e4561e..a1f4998bc6be 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -282,9 +282,15 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) + const char *name, void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, + (const void **)&value, size); if (error < 0) return error; size = error; } + if (size && is_posix_acl_xattr(name)) + posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - return __vfs_getxattr(dentry, inode, name, value, size); + error = __vfs_getxattr(dentry, inode, name, value, size); + if (error > 0 && is_posix_acl_xattr(name)) + posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); + return error; } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns, if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - ctx->kvalue, ctx->size); + posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - ctx->kvalue, error); + posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 836ab1b8ed7b..224649a76cbb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -50,7 +50,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. @@ -393,16 +393,10 @@ xfs_attr_sf_addname( * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. 
*/ - error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); + error = xfs_attr_shortform_to_leaf(args); if (error) return error; - /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. - */ - xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); attr->xattri_dela_state = XFS_DAS_LEAF_ADD; out: trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); @@ -447,11 +441,9 @@ xfs_attr_leaf_addname( /* * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. The held buffer is no longer valid - * after this call, regardless of the result. + * a sf-to-leaf conversion. */ - error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); - attr->xattri_leaf_bp = NULL; + error = xfs_attr_leaf_try_add(args); if (error == -ENOSPC) { error = xfs_attr3_leaf_to_node(args); @@ -497,8 +489,6 @@ xfs_attr_node_addname( struct xfs_da_args *args = attr->xattri_da_args; int error; - ASSERT(!attr->xattri_leaf_bp); - error = xfs_attr_node_addname_find_attr(attr); if (error) return error; @@ -997,9 +987,11 @@ xfs_attr_set( /* * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. + * removal to fail as well. Preserve the logged flag, since we need + * to pass that through to the logging code. */ - args->op_flags = XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_OKNOENT | + (args->op_flags & XFS_DA_OP_LOGGED); if (args->value) { XFS_STATS_INC(mp, xs_attr_set); @@ -1213,24 +1205,14 @@ xfs_attr_restore_rmt_blk( */ STATIC int xfs_attr_leaf_try_add( - struct xfs_da_args *args, - struct xfs_buf *bp) + struct xfs_da_args *args) { + struct xfs_buf *bp; int error; - /* - * If the caller provided a buffer to us, it is locked and held in - * the transaction because it just did a shortform to leaf conversion. - * Hence we don't need to read it again. Otherwise read in the leaf - * buffer. - */ - if (bp) { - xfs_trans_bhold_release(args->trans, bp); - } else { - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; /* * Look up the xattr name to set the insertion point for the new xattr. @@ -1439,12 +1421,11 @@ static int xfs_attr_node_try_addname( struct xfs_attr_intent *attr) { - struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; - trace_xfs_attr_node_addname(args); + trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index e329da3e7afa..dfb47fa63c6d 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -28,16 +28,6 @@ struct xfs_attr_list_context; */ #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ -static inline bool xfs_has_larp(struct xfs_mount *mp) -{ -#ifdef DEBUG - /* Logged xattrs require a V5 super for log_incompat */ - return xfs_has_crc(mp) && xfs_globals.larp; -#else - return false; -#endif -} - /* * Kernel-internal version of the attrlist cursor. 
*/ @@ -525,11 +515,6 @@ struct xfs_attr_intent { */ struct xfs_attri_log_nameval *xattri_nameval; - /* - * Used by xfs_attr_set to hold a leaf buffer across a transaction roll - */ - struct xfs_buf *xattri_leaf_bp; - /* Used to keep track of current state of delayed operation */ enum xfs_delattr_state xattri_dela_state; @@ -624,7 +609,7 @@ static inline enum xfs_delattr_state xfs_attr_init_replace_state(struct xfs_da_args *args) { args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; - if (xfs_has_larp(args->dp->i_mount)) + if (args->op_flags & XFS_DA_OP_LOGGED) return xfs_attr_init_remove_state(args); return xfs_attr_init_add_state(args); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15a990409463..8f47396f8dd2 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -289,6 +289,23 @@ xfs_attr3_leaf_verify_entry( return NULL; } +/* + * Validate an attribute leaf block. + * + * Empty leaf blocks can occur under the following circumstances: + * + * 1. setxattr adds a new extended attribute to a file; + * 2. The file has zero existing attributes; + * 3. The attribute is too large to fit in the attribute fork; + * 4. The attribute is small enough to fit in a leaf block; + * 5. A log flush occurs after committing the transaction that creates + * the (empty) leaf block; and + * 6. The filesystem goes down after the log flush but before the new + * attribute can be committed to the leaf block. + * + * Hence we need to ensure that we don't fail the validation purely + * because the leaf is empty. + */ static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) @@ -311,15 +328,6 @@ xfs_attr3_leaf_verify( return fa; /* - * Empty leaf blocks should never occur; they imply the existence of a - * software bug that needs fixing. xfs_repair also flags them as a - * corruption that needs fixing, so we should never let these go to - * disk. - */ - if (ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -922,14 +930,10 @@ xfs_attr_shortform_getvalue( return -ENOATTR; } -/* - * Convert from using the shortform to the leaf. On success, return the - * buffer so that we can keep it locked until we're totally done with it. - */ +/* Convert from using the shortform to the leaf format. 
*/ int xfs_attr_shortform_to_leaf( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp; struct xfs_attr_shortform *sf; @@ -991,7 +995,6 @@ xfs_attr_shortform_to_leaf( sfe = xfs_attr_sf_nextentry(sfe); } error = 0; - *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; @@ -1530,7 +1533,7 @@ xfs_attr3_leaf_add_work( if (tmp) entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_REPLACE) { - if (!xfs_has_larp(mp)) + if (!(args->op_flags & XFS_DA_OP_LOGGED)) entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index efa757f1e912..368f4d9fa1d5 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, - struct xfs_buf **leaf_bp); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); int xfs_attr_sf_findname(struct xfs_da_args *args, struct xfs_attr_sf_entry **sfep, diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index d33b7686a0b3..ffa3df5b2893 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -92,6 +92,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ #define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ #define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -101,7 +102,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ { XFS_DA_OP_NOTIME, "NOTIME" }, \ { XFS_DA_OP_REMOVE, "REMOVE" }, \ - { XFS_DA_OP_RECOVERY, "RECOVERY" } + { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ + { XFS_DA_OP_LOGGED, "LOGGED" } /* * Storage for holding state during Btree searches and split/join ops. 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8ec38b25187b..5d1a995b15f8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -570,7 +570,7 @@ const struct address_space_operations xfs_address_space_operations = { .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = xfs_iomap_swapfile_activate, diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4a28c2d77070..5077a7ad5646 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -413,18 +413,20 @@ xfs_attr_create_intent( struct xfs_mount *mp = tp->t_mountp; struct xfs_attri_log_item *attrip; struct xfs_attr_intent *attr; + struct xfs_da_args *args; ASSERT(count == 1); - if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) - return NULL; - /* * Each attr item only performs one attribute operation at a time, so * this is a list of one */ attr = list_first_entry_or_null(items, struct xfs_attr_intent, xattri_list); + args = attr->xattri_da_args; + + if (!(args->op_flags & XFS_DA_OP_LOGGED)) + return NULL; /* * Create a buffer to store the attribute name and value. This buffer @@ -432,8 +434,6 @@ xfs_attr_create_intent( * and the lower level xattr log items. */ if (!attr->xattri_nameval) { - struct xfs_da_args *args = attr->xattri_da_args; - /* * Transfer our reference to the name/value buffer to the * deferred work state structure. @@ -576,7 +576,7 @@ xfs_attri_item_recover( struct xfs_trans_res tres; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error, ret = 0; + int error; int total; int local; struct xfs_attrd_log_item *done_item = NULL; @@ -617,7 +617,10 @@ xfs_attri_item_recover( args->namelen = nv->name.i_len; args->hashval = xfs_da_hashname(args->name, args->namelen); args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; - args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | + XFS_DA_OP_LOGGED; + + ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); switch (attr->xattri_op_flags) { case XFS_ATTRI_OP_FLAGS_SET: @@ -652,29 +655,32 @@ xfs_attri_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ret = xfs_xattri_finish_update(attr, done_item); - if (ret == -EAGAIN) { - /* There's more work to do, so add it to this transaction */ + error = xfs_xattri_finish_update(attr, done_item); + if (error == -EAGAIN) { + /* + * There's more work to do, so add the intent item to this + * transaction so that we can continue it later. 
+ */ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - } else - error = ret; + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + if (error) + goto out_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + return 0; + } if (error) { xfs_trans_cancel(tp); goto out_unlock; } error = xfs_defer_ops_capture_and_commit(tp, capture_list); - out_unlock: - if (attr->xattri_leaf_bp) - xfs_buf_relse(attr->xattri_leaf_bp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); out: - if (ret != -EAGAIN) - xfs_attr_free_item(attr); + xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index ae4345b37621..fe21c76f75b8 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -15,7 +15,7 @@ xfs_rw_bdev( sector_t sector, unsigned int count, char *data, - unsigned int op) + enum req_op op) { unsigned int is_vmalloc = is_vmalloc_addr(data); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 52be58372c63..85e1a26c92e8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -686,6 +686,8 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bf4e60871068..5e8f40d8c052 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1416,7 +1416,7 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int op) + blk_opf_t op) { int page_index; unsigned int total_nr_pages = bp->b_page_count; @@ -1493,7 +1493,7 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int op; + blk_opf_t op; int offset; int size; int i; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5a171c0b244b..8d9b14d2b912 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -410,7 +410,7 @@ restart: spin_unlock(&ip->i_flags_lock); out: - return file_modified(file); + return kiocb_modified(iocb); } static int @@ -700,12 +700,11 @@ xfs_file_buffered_write( bool cleared_space = false; unsigned int iolock; - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; - write_retry: iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) @@ -1165,7 +1164,7 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5269354b1b69..2609825d53ee 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -440,7 +440,7 @@ xfs_inodegc_queue_all( for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); } } @@ -1841,8 +1841,8 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, - work); + struct xfs_inodegc *gc = container_of(to_delayed_work(work), + struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; @@ -1862,19 +1862,29 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run 
immediately and - * wait for the work to finish. + * Expedite all pending inodegc work to run immediately. This does not wait for + * completion of the work. */ void -xfs_inodegc_flush( +xfs_inodegc_push( struct xfs_mount *mp) { if (!xfs_is_inodegc_enabled(mp)) return; + trace_xfs_inodegc_push(mp, __return_address); + xfs_inodegc_queue_all(mp); +} +/* + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - - xfs_inodegc_queue_all(mp); flush_workqueue(mp->m_inodegc_wq); } @@ -2014,6 +2024,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2025,19 +2036,26 @@ xfs_inodegc_queue( items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); - put_cpu_ptr(gc); - if (!xfs_is_inodegc_enabled(mp)) + /* + * We queue the work while holding the current CPU so that the work + * is scheduled to run on this CPU. + */ + if (!xfs_is_inodegc_enabled(mp)) { + put_cpu_ptr(gc); return; - - if (xfs_inodegc_want_queue_work(ip, items)) { - trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); } + if (xfs_inodegc_want_queue_work(ip, items)) + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + put_cpu_ptr(gc); + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); - flush_work(&gc->work); + flush_delayed_work(&gc->work); } } @@ -2054,7 +2072,7 @@ xfs_inodegc_cpu_dead( unsigned int count = 0; dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_work_sync(&dead_gc->work); + cancel_delayed_work_sync(&dead_gc->work); if (llist_empty(&dead_gc->list)) return; @@ -2073,12 +2091,12 @@ xfs_inodegc_cpu_dead( llist_add_batch(first, last, &gc->list); count += READ_ONCE(gc->items); WRITE_ONCE(gc->items, count); - put_cpu_ptr(gc); if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); } + put_cpu_ptr(gc); } /* @@ -2173,7 +2191,7 @@ xfs_inodegc_shrinker_scan( unsigned int h = READ_ONCE(gc->shrinker_hits); WRITE_ONCE(gc->shrinker_hits, h + 1); - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); no_items = false; } } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2e4cfddf8b8e..6cd180721659 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_push(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 52d6f2c7d58b..3e1c62ffa4f7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,6 +132,26 @@ xfs_ilock_attr_map_shared( } /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED, + * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values + * to set in 
lock_flags. + */ +static inline void +xfs_lock_flags_assert( + uint lock_flags) +{ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + ASSERT(lock_flags != 0); +} + +/* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. @@ -168,18 +188,7 @@ xfs_ilock( { trace_xfs_ilock(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { down_write_nested(&VFS_I(ip)->i_rwsem, @@ -222,18 +231,7 @@ xfs_ilock_nowait( { trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) @@ -291,19 +289,7 @@ xfs_iunlock( xfs_inode_t *ip, uint lock_flags) { - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 
- */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - ASSERT(lock_flags != 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -379,8 +365,8 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); } if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5a364a7d58fd..0d67ff8a8961 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1096,7 +1096,8 @@ xfs_flags2diflags2( { uint64_t di_flags2 = (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | - XFS_DIFLAG2_BIGTIME)); + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a393259a3a3..5d50fed291b4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -664,7 +664,7 @@ xfs_ilock_for_iomap( unsigned flags, unsigned *lockmode) { - unsigned mode = XFS_ILOCK_SHARED; + unsigned int mode = *lockmode; bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* @@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin( int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin( bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; + unsigned int lockmode = XFS_ILOCK_EXCL; if (xfs_is_shutdown(mp)) return -EIO; @@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); - xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { @@ -1172,7 +1175,7 @@ xfs_read_iomap_begin( xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 29f5b8b8aca6..a7402f6ea510 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -667,13 +667,15 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; @@ -704,13 +706,13 @@ xfs_setattr_nonsize( * didn't have the inode locked, inode's dquot(s) would have changed * also. 
*/ - if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && - !uid_eq(inode->i_uid, iattr->ia_uid)) { + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(mnt_userns, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && - !gid_eq(inode->i_gid, iattr->ia_gid)) { + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(mnt_userns, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index cb9105d667db..f9878021e7d0 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -196,7 +196,7 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) } int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, - char *data, unsigned int op); + char *data, enum req_op op); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1e972f884a81..ae904b21e9cc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2092,8 +2092,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - xlog_cil_destroy(log); - /* * Cycle all the iclogbuf locks to make sure all log IO completion * is done before we tear down these buffers. @@ -2105,6 +2103,13 @@ xlog_dealloc_log( iclog = iclog->ic_next; } + /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. + */ + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5f7e4e6e33ce..940c8107cbd4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -122,7 +122,7 @@ xlog_do_io( xfs_daddr_t blk_no, unsigned int nbblks, char *data, - unsigned int op) + enum req_op op) { int error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ba5d42abf66e..d2eaebd85abf 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -61,7 +61,7 @@ struct xfs_error_cfg { */ struct xfs_inodegc { struct llist_head list; - struct work_struct work; + struct delayed_work work; /* approximate count of inodes in the list */ unsigned int items; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 74ac9ca9e119..392cb39cc10c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -454,9 +454,12 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; - /* Flush inodegc work at the start of a quota reporting scan. */ + /* + * Expedite pending inodegc work at the start of a quota reporting + * scan but don't block waiting for it to complete. + */ if (id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); /* * Try to get the dquot. We don't want it allocated on disk, so don't @@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next( /* Flush inodegc work at the start of a quota reporting scan. */ if (*id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ed18160e6181..aa977c7ea370 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -797,8 +797,11 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - /* Wait for whatever inactivations are in progress. 
*/ - xfs_inodegc_flush(mp); + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1074,7 +1077,7 @@ xfs_inodegc_init_percpu( gc = per_cpu_ptr(mp->m_inodegc, cpu); init_llist_head(&gc->list); gc->items = 0; - INIT_WORK(&gc->work, xfs_inodegc_worker); + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d32026585c1b..0fa1b7a2918c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 35e13e125ec6..c325a28b89a8 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -68,6 +68,18 @@ xfs_attr_rele_log_assist( xlog_drop_incompat_feat(mp->m_log); } +static inline bool +xfs_attr_want_log_assist( + struct xfs_mount *mp) +{ +#ifdef DEBUG + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; +#else + return false; +#endif +} + /* * Set or remove an xattr, having grabbed the appropriate logging resources * prior to calling libxfs. @@ -80,11 +92,14 @@ xfs_attr_change( bool use_logging = false; int error; - if (xfs_has_larp(mp)) { + ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + + if (xfs_attr_want_log_assist(mp)) { error = xfs_attr_grab_log_assist(mp); if (error) return error; + args->op_flags |= XFS_DA_OP_LOGGED; use_logging = true; } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bcb21aea990a..511bb9fa3750 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -60,8 +60,7 @@ static void zonefs_account_active(struct inode *inode) } } -static inline int zonefs_zone_mgmt(struct inode *inode, - enum req_opf op) +static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) { struct zonefs_inode_info *zi = ZONEFS_I(inode); int ret; @@ -110,15 +109,51 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) } } -static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; loff_t isize; - /* All I/Os should always be within the file maximum size */ + /* + * All blocks are always mapped below EOF. If reading past EOF, + * act as if there is a hole up to the file maximum size. 
+ */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { + .iomap_begin = zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ if (WARN_ON_ONCE(offset + length > zi->i_max_size)) return -EIO; @@ -128,7 +163,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * operation. */ if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT))) + !(flags & IOMAP_DIRECT))) return -EIO; /* @@ -137,47 +172,44 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * write pointer) and unwriten beyond. */ mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; isize = i_size_read(inode); - if (offset >= isize) + if (iomap->offset >= isize) { iomap->type = IOMAP_UNWRITTEN; - else + iomap->length = zi->i_max_size - iomap->offset; + } else { iomap->type = IOMAP_MAPPED; - if (flags & IOMAP_WRITE) - length = zi->i_max_size - offset; - else - length = min(length, isize - offset); + iomap->length = isize - iomap->offset; + } mutex_unlock(&zi->i_truncate_mutex); - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset; - iomap->bdev = inode->i_sb->s_bdev; - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - trace_zonefs_iomap_begin(inode, iomap); return 0; } -static const struct iomap_ops zonefs_iomap_ops = { - .iomap_begin = zonefs_iomap_begin, +static const struct iomap_ops zonefs_write_iomap_ops = { + .iomap_begin = zonefs_write_iomap_begin, }; static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_iomap_ops); + return iomap_read_folio(folio, &zonefs_read_iomap_ops); } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_read_iomap_ops); } /* * Map blocks for page writeback. This is used only on conventional zone files, * which implies that the page range can only be within the fixed inode size. 
*/ -static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) { struct zonefs_inode_info *zi = ZONEFS_I(inode); @@ -191,12 +223,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, offset < wpc->iomap.offset + wpc->iomap.length) return 0; - return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset, - IOMAP_WRITE, &wpc->iomap, NULL); + return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, + IOMAP_WRITE, &wpc->iomap, NULL); } static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_map_blocks, + .map_blocks = zonefs_write_map_blocks, }; static int zonefs_writepage(struct page *page, struct writeback_control *wbc) @@ -226,7 +258,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, return -EINVAL; } - return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops); + return iomap_swapfile_activate(sis, swap_file, span, + &zonefs_read_iomap_ops); } static const struct address_space_operations zonefs_file_aops = { @@ -237,7 +270,7 @@ static const struct address_space_operations zonefs_file_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .direct_IO = noop_direct_IO, @@ -491,7 +524,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) { struct zonefs_inode_info *zi = ZONEFS_I(inode); loff_t old_isize; - enum req_opf op; + enum req_op op; int ret = 0; /* @@ -582,7 +615,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(inode, iattr); + ret = dquot_transfer(mnt_userns, inode, iattr); if (ret) return ret; } @@ -647,7 +680,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); @@ -746,7 +779,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; bio->bi_ioprio = iocb->ki_ioprio; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) bio->bi_opf |= REQ_FUA; ret = bio_iov_iter_get_pages(bio, from); @@ -899,7 +932,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) if (append) ret = zonefs_file_dio_append(iocb, from); else - ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, &zonefs_write_dio_ops, 0, NULL, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { @@ -948,7 +981,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, if (ret <= 0) goto inode_unlock; - ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); if (ret > 0) iocb->ki_pos += ret; else if (ret == -EIO) @@ -1041,7 +1074,7 @@ static ssize_t 
zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) goto inode_unlock; } file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, + ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, &zonefs_read_dio_ops, 0, NULL, 0); } else { ret = generic_file_read_iter(iocb, to); @@ -1085,7 +1118,8 @@ static int zonefs_seq_file_write_open(struct inode *inode) if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - if (wro > sbi->s_max_wro_seq_files) { + if (sbi->s_max_wro_seq_files + && wro > sbi->s_max_wro_seq_files) { atomic_dec(&sbi->s_wro_seq_files); ret = -EBUSY; goto unlock; @@ -1359,7 +1393,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, { struct super_block *sb = parent->i_sb; - inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1; + inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -1505,7 +1539,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, /* * The first zone contains the super block: skip it. */ - end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk); + end = zd->zones + bdev_nr_zones(sb->s_bdev); for (zone = &zd->zones[1]; zone < end; zone = next) { next = zone + 1; @@ -1600,8 +1634,8 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) struct block_device *bdev = zd->sb->s_bdev; int ret; - zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk), - sizeof(struct blk_zone), GFP_KERNEL); + zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), + GFP_KERNEL); if (!zd->zones) return -ENOMEM; @@ -1613,9 +1647,9 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) return ret; } - if (ret != blkdev_nr_zones(bdev->bd_disk)) { + if (ret != bdev_nr_zones(bdev)) { zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", - ret, blkdev_nr_zones(bdev->bd_disk)); + ret, bdev_nr_zones(bdev)); return -EIO; } @@ -1652,11 +1686,11 @@ static int zonefs_read_super(struct super_block *sb) if (ret) goto free_page; - super = kmap(page); + super = page_address(page); ret = -EINVAL; if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) - goto unmap; + goto free_page; stored_crc = le32_to_cpu(super->s_crc); super->s_crc = 0; @@ -1664,14 +1698,14 @@ static int zonefs_read_super(struct super_block *sb) if (crc != stored_crc) { zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)", crc, stored_crc); - goto unmap; + goto free_page; } sbi->s_features = le64_to_cpu(super->s_features); if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { zonefs_err(sb, "Unknown features set 0x%llx\n", sbi->s_features); - goto unmap; + goto free_page; } if (sbi->s_features & ZONEFS_F_UID) { @@ -1679,7 +1713,7 @@ static int zonefs_read_super(struct super_block *sb) le32_to_cpu(super->s_uid)); if (!uid_valid(sbi->s_uid)) { zonefs_err(sb, "Invalid UID feature\n"); - goto unmap; + goto free_page; } } @@ -1688,7 +1722,7 @@ static int zonefs_read_super(struct super_block *sb) le32_to_cpu(super->s_gid)); if (!gid_valid(sbi->s_gid)) { zonefs_err(sb, "Invalid GID feature\n"); - goto unmap; + goto free_page; } } @@ -1697,14 +1731,12 @@ static int zonefs_read_super(struct super_block *sb) if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) { zonefs_err(sb, "Reserved area is being used\n"); - goto unmap; + goto free_page; } import_uuid(&sbi->s_uuid, super->s_uuid); ret = 0; -unmap: - kunmap(page); free_page: __free_page(page); @@ -1760,12 +1792,6 @@ static 
int zonefs_fill_super(struct super_block *sb, void *data, int silent) atomic_set(&sbi->s_wro_seq_files, 0); sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); - if (!sbi->s_max_wro_seq_files && - sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); - sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; - } - atomic_set(&sbi->s_active_seq_files, 0); sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); @@ -1787,8 +1813,15 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) goto cleanup; - zonefs_info(sb, "Mounting %u zones", - blkdev_nr_zones(sb->s_bdev->bd_disk)); + zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); + + if (!sbi->s_max_wro_seq_files && + !sbi->s_max_active_seq_files && + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + zonefs_info(sb, + "No open and active zone limits. Ignoring explicit_open mount option\n"); + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } /* Create root directory inode */ ret = -ENOMEM; @@ -1796,7 +1829,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (!inode) goto cleanup; - inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk); + inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); inode->i_op = &zonefs_dir_inode_operations; diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h index f369d7d50303..42edcfd393ed 100644 --- a/fs/zonefs/trace.h +++ b/fs/zonefs/trace.h @@ -20,12 +20,12 @@ #define show_dev(dev) MAJOR(dev), MINOR(dev) TRACE_EVENT(zonefs_zone_mgmt, - TP_PROTO(struct inode *inode, enum req_opf op), + TP_PROTO(struct inode *inode, enum req_op op), TP_ARGS(inode, op), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(int, op) + __field(enum req_op, op) __field(sector_t, sector) __field(sector_t, nr_sectors) ), |
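Editor's note: xfs_ilock(), xfs_ilock_nowait() and xfs_iunlock() above each carried an identical block of lock-flag ASSERTs; the patch hoists them into one xfs_lock_flags_assert() helper. Below is a compilable toy version of that deduplication, with invented DEMO_* flag bits and plain assert() standing in for the kernel's ASSERT():

#include <assert.h>

#define DEMO_IOLOCK_SHARED      (1u << 0)
#define DEMO_IOLOCK_EXCL        (1u << 1)
#define DEMO_ILOCK_SHARED       (1u << 2)
#define DEMO_ILOCK_EXCL         (1u << 3)
#define DEMO_LOCK_MASK          0xfu

/* One copy of the sanity checks instead of three. */
static inline void demo_lock_flags_assert(unsigned int flags)
{
        /* SHARED and EXCL are mutually exclusive per lock class. */
        assert((flags & (DEMO_IOLOCK_SHARED | DEMO_IOLOCK_EXCL)) !=
               (DEMO_IOLOCK_SHARED | DEMO_IOLOCK_EXCL));
        assert((flags & (DEMO_ILOCK_SHARED | DEMO_ILOCK_EXCL)) !=
               (DEMO_ILOCK_SHARED | DEMO_ILOCK_EXCL));
        /* No unknown bits, and at least one lock requested. */
        assert((flags & ~DEMO_LOCK_MASK) == 0);
        assert(flags != 0);
}

static void demo_ilock(unsigned int flags)
{
        demo_lock_flags_assert(flags);
        /* ... take the requested locks ... */
}

static void demo_iunlock(unsigned int flags)
{
        demo_lock_flags_assert(flags);
        /* ... drop the requested locks ... */
}

int main(void)
{
        demo_ilock(DEMO_ILOCK_EXCL);
        demo_iunlock(DEMO_ILOCK_EXCL);
        return 0;
}

The same series also fixes a real bug that the old duplication helped hide: xfs_isilocked() was testing i_rwsem with the IOLOCK bits for MMAPLOCK queries, and now tests invalidate_lock with the MMAPLOCK bits.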
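Editor's note: the xfs_icache.c changes convert the per-CPU inodegc worker from work_struct to delayed_work and split the old "flush everything and wait" entry point in two: xfs_inodegc_push() expedites queued work without blocking, while xfs_inodegc_flush() pushes and then waits; statfs and the quota scans switch to push so they cannot stall behind, say, a huge truncate. The userspace pthreads sketch below models only that push-versus-flush contract; gc_queue/gc_push/gc_flush are invented analogues, not the kernel workqueue API (mod_delayed_work(), flush_workqueue()).

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int pending;             /* items queued for background processing */
static bool expedite;           /* push() requested: skip the batching delay */

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        for (;;) {
                while (pending == 0)
                        pthread_cond_wait(&kick, &lock);
                if (!expedite) {
                        /* batch work for a while unless someone pushes */
                        pthread_mutex_unlock(&lock);
                        usleep(1000);
                        pthread_mutex_lock(&lock);
                }
                printf("processed %d item(s)\n", pending);
                pending = 0;
                expedite = false;
                pthread_cond_broadcast(&done);
        }
        return NULL;
}

static void gc_queue(void)      /* analogue of xfs_inodegc_queue() */
{
        pthread_mutex_lock(&lock);
        pending++;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&lock);
}

static void gc_push(void)       /* expedite, do not wait */
{
        pthread_mutex_lock(&lock);
        expedite = true;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&lock);
}

static void gc_flush(void)      /* push, then wait for completion */
{
        gc_push();
        pthread_mutex_lock(&lock);
        while (pending)
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        gc_queue();
        gc_flush();     /* reporting paths would call gc_push() only */
        return 0;
}

Splitting push from flush keeps the throttling behaviour for callers that genuinely must wait, while letting pure reporting paths like statfs avoid blocking entirely.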
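Editor's note: zonefs likewise splits its single zonefs_iomap_begin() into read and write variants with different mapping rules: a read past EOF now maps to a hole of the requested length, while a write maps the range from EOF up to the zone's maximum file size as unwritten, since sequential-zone files are written at the write pointer. Here is a simplified stand-alone sketch of those two rules, with demo_* types in place of the real iomap structures:

#include <stdio.h>

enum demo_ext { DEMO_HOLE, DEMO_MAPPED, DEMO_UNWRITTEN };

struct demo_inode { long isize, max_size; };
struct demo_map   { enum demo_ext type; long offset, length; };

static void demo_read_begin(const struct demo_inode *zi, long off,
                            long len, struct demo_map *map)
{
        map->offset = off;
        if (off >= zi->isize) {         /* read past EOF: report a hole */
                map->type = DEMO_HOLE;
                map->length = len;
        } else {                        /* mapped, clamped to EOF */
                map->type = DEMO_MAPPED;
                map->length = zi->isize - off;
        }
}

static void demo_write_begin(const struct demo_inode *zi, long off,
                             long len, struct demo_map *map)
{
        (void)len;
        map->offset = off;
        if (off >= zi->isize) {         /* append space: unwritten to max */
                map->type = DEMO_UNWRITTEN;
                map->length = zi->max_size - off;
        } else {
                map->type = DEMO_MAPPED;
                map->length = zi->isize - off;
        }
}

int main(void)
{
        struct demo_inode zi = { .isize = 4096, .max_size = 65536 };
        struct demo_map m;

        demo_read_begin(&zi, 8192, 4096, &m);
        printf("read  @8192 -> type %d len %ld\n", m.type, m.length);
        demo_write_begin(&zi, 8192, 4096, &m);
        printf("write @8192 -> type %d len %ld\n", m.type, m.length);
        return 0;
}

Keeping two small iomap_ops also lets the read-only paths (readahead, read_folio, swap activation) bind to the read rules alone, as the zonefs aops hunks above do.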