From ce529cc25b184e93397b94a8a322128fc0095cbb Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Wed, 30 Nov 2022 14:04:55 +0800 Subject: erofs: enable large folios for iomap mode Enable large folios for iomap mode. Then the readahead routine will pass down large folios containing multiple pages. Let's enable this for the non-compressed format for now, until the compression part supports large folios later. When large folios are supported, the iomap routine will allocate an iomap_page for each large folio, and thus we need iomap_release_folio() and iomap_invalidate_folio() to free the iomap_page when these folios get reclaimed or invalidated. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221130060455.44532-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 2 ++ fs/erofs/inode.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fe8ac0e163f7..c9526c627dda 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -403,6 +403,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ad2a82f2eb4c..e457b8a59ee7 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,6 +295,8 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; -- cgit v1.2.1 From 27f2a2dcc6261406b509b5022a1e5c23bf622830 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 25 Nov 2022 19:08:22 +0800 Subject: erofs: check the uniqueness of fsid in shared domain in advance When shared domain is enabled, mounting twice with the same fsid and domain_id will trigger a sysfs warning as shown below: sysfs: cannot create duplicate filename '/fs/erofs/d0,meta.bin' CPU: 15 PID: 1051 Comm: mount Not tainted 6.1.0-rc6+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl+0x38/0x49 dump_stack+0x10/0x12 sysfs_warn_dup.cold+0x17/0x27 sysfs_create_dir_ns+0xb8/0xd0 kobject_add_internal+0xb1/0x240 kobject_init_and_add+0x71/0xa0 erofs_register_sysfs+0x89/0x110 erofs_fc_fill_super+0x98c/0xaf0 vfs_get_super+0x7d/0x100 get_tree_nodev+0x16/0x20 erofs_fc_get_tree+0x20/0x30 vfs_get_tree+0x24/0xb0 path_mount+0x2fa/0xa90 do_mount+0x7c/0xa0 __x64_sys_mount+0x8b/0xe0 do_syscall_64+0x30/0x60 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The reason is that erofs_fscache_register_cookie() doesn't guarantee the primary data blob (aka fsid) is unique in the shared domain, so erofs_register_sysfs() invoked by the second mount will fail due to the duplicated fsid in the shared domain and report the warning. It would be better to check the uniqueness of the fsid before doing erofs_register_sysfs(), so add a new flags parameter to erofs_fscache_register_cookie() and do the uniqueness check when EROFS_REG_COOKIE_NEED_NOEXIST is set.
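Conceptually, the new flag turns the cookie lookup from "share on hit" into "fail on hit". A minimal sketch of that control flow, simplified from the hunks below (find_cookie_in_domain() is a hypothetical stand-in for the s_inode_list walk):

/* flags for erofs_fscache_register_cookie(), as introduced by this patch */
#define EROFS_REG_COOKIE_NEED_INODE	1
#define EROFS_REG_COOKIE_NEED_NOEXIST	2

static struct erofs_fscache *
lookup_domain_cookie(struct super_block *sb, char *name, unsigned int flags)
{
	/* hypothetical helper standing in for the inode-list walk */
	struct erofs_fscache *ctx = find_cookie_in_domain(sb, name);

	if (!ctx)
		return NULL;		/* no hit: caller allocates a fresh cookie */
	if (flags & EROFS_REG_COOKIE_NEED_NOEXIST) {
		/* the primary data blob (fsid) must be unique per domain */
		erofs_err(sb, "%s already exists in domain", name);
		return ERR_PTR(-EEXIST);
	}
	return ctx;			/* auxiliary blobs may share the cookie */
}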
After the patch, the error in dmesg for the duplicated mount would be: erofs: ...: erofs_domain_register_cookie: XX already exists in domain YY Reviewed-by: Jia Zhu Reviewed-by: Jingbo Xu Reviewed-by: Chao Yu Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221125110822.3812942-1-houtao@huaweicloud.com Fixes: 7d41963759fe ("erofs: Support sharing cookies in the same domain") Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 47 +++++++++++++++++++++++++++++++++++------------ fs/erofs/internal.h | 10 ++++++++-- fs/erofs/super.c | 2 +- 3 files changed, 44 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index af5ed6b9c54d..6a792a513d6b 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -494,7 +494,8 @@ static int erofs_fscache_register_domain(struct super_block *sb) static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -516,7 +517,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, fscache_use_cookie(cookie, false); ctx->cookie = cookie; - if (need_inode) { + if (flags & EROFS_REG_COOKIE_NEED_INODE) { struct inode *const inode = new_inode(sb); if (!inode) { @@ -554,14 +555,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) static struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { int err; struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + ctx = erofs_fscache_acquire_cookie(sb, name, flags); if (IS_ERR(ctx)) return ctx; @@ -589,7 +591,8 @@ out: static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct inode *inode; struct erofs_fscache *ctx; @@ -602,23 +605,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, ctx = inode->i_private; if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) continue; - igrab(inode); + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + igrab(inode); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + ctx = erofs_fscache_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { if (EROFS_SB(sb)->domain_id) - return erofs_domain_register_cookie(sb, name, need_inode); - return erofs_fscache_acquire_cookie(sb, name, need_inode); + return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); } void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) @@ -647,6 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; + unsigned int flags; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -655,8 +666,20 @@ int erofs_fscache_register_fs(struct 
super_block *sb) if (ret) return ret; - /* acquired domain/volume will be relinquished in kill_sb() on error */ - fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true); + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. + * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. + * + * Acquired domain/volume will be relinquished in kill_sb() on error. + */ + flags = EROFS_REG_COOKIE_NEED_INODE; + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); if (IS_ERR(fscache)) return PTR_ERR(fscache); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 05dc68627722..e51f27b6bde1 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -604,13 +604,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_NEED_INODE 1 +#define EROFS_REG_COOKIE_NEED_NOEXIST 2 + /* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode); + char *name, + unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; @@ -623,7 +628,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1c7dcca702b3..481788c24a68 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - fscache = erofs_fscache_register_cookie(sb, dif->path, false); + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; -- cgit v1.2.1 From 1282dea37b09087b8aec59f0774572c16b52276a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 6 Dec 2022 14:03:52 +0800 Subject: erofs: clean up cached I/O strategies After commit 4c7e42552b3a ("erofs: remove useless cache strategy of DELAYEDALLOC"), only one cached I/O allocation strategy is supported: When cached I/O is preferred, page allocation is applied without direct reclaim. If allocation fails, fall back to inplace I/O. Let's get rid of z_erofs_cache_alloctype. No logical changes. 
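The surviving strategy is a two-step fallback inside the z_erofs_bind_cache() loop; a hedged sketch of the idea (the exact gfp mask used by the driver may differ):

	/* optimistic allocation: forbid direct reclaim so failure is cheap */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN;
	struct page *newpage = erofs_allocpage(pagepool, gfp);

	if (!newpage)
		continue;	/* no cached page: fall back to in-place I/O */
	set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);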
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221206060352.152830-1-xiang@kernel.org --- fs/erofs/zdata.c | 77 +++++++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b792d424d774..b66c16473273 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* how to allocate cached pages for a pcluster */ -enum z_erofs_cache_alloctype { - DONTALLOC, /* don't allocate any cached pages */ - /* - * try to use cached I/O if page allocation succeeds or fallback - * to in-place I/O instead to avoid any direct reclaim. - */ - TRYALLOC, -}; - /* * tagged pointer with 1-bit tag for all compressed pages * tag 0 - the page is just found with an extra page reference @@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } +static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +{ + unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; + + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + fe->map.m_la < fe->headoffset) + return true; + + return false; +} + static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - enum z_erofs_cache_alloctype type, struct page **pagepool) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* * optimistic allocation without direct reclaim since inplace I/O @@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } else { /* I/O is needed, no possible to decompress directly */ standalone = false; - switch (type) { - case TRYALLOC: - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - continue; - set_page_private(newpage, - Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - break; - default: /* DONTALLOC */ + if (!shouldalloc) continue; - } + + /* + * try to use cached I/O if page allocation + * succeeds or fallback to in-place I/O instead + * to avoid any direct reclaim. 
+ */ + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, @@ -637,20 +645,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, - unsigned int cachestrategy, - erofs_off_t la) -{ - if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) - return false; - - if (fe->backmost) - return true; - - return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && - la < fe->headoffset; -} - static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, struct page *page, unsigned int pageofs, unsigned int len) @@ -687,12 +681,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - - enum z_erofs_cache_alloctype cache_strategy; unsigned int cur, end, spiltted; int err = 0; @@ -746,13 +737,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, - map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; - - z_erofs_bind_cache(fe, cache_strategy, pagepool); + z_erofs_bind_cache(fe, pagepool); } hitted: /* -- cgit v1.2.1 From 8669247524c73e16e4d3384c4ff882e5c5d06194 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:11 +0800 Subject: fscache,cachefiles: add prepare_ondemand_read() callback Add prepare_ondemand_read() callback dedicated for the on-demand read scenario, so that callers from this scenario can be decoupled from netfs_io_subrequest. The original cachefiles_prepare_read() is now refactored to a generic routine accepting a parameter list instead of netfs_io_subrequest. There's no logic change, except that the debug id of subrequest and request is removed from trace_cachefiles_prep_read(). Reviewed-by: Jeff Layton Signed-off-by: Jingbo Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20221124034212.81892-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/cachefiles/io.c | 77 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 000a28f46e59..175a25fcade8 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres, term_func, term_func_priv); } -/* - * Prepare a read operation, shortening it to a cached/uncached - * boundary as appropriate. 
- */ -static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) +static inline enum netfs_io_source +cachefiles_do_prepare_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t netfs_ino) { enum cachefiles_prepare_read_trace why; - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct cachefiles_object *object; + struct cachefiles_object *object = NULL; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; + size_t len = *_len; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; int rc; - _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + _enter("%zx @%llx/%llx", len, start, i_size); - if (subreq->start >= i_size) { + if (start >= i_size) { ret = NETFS_FILL_WITH_ZEROES; why = cachefiles_trace_read_after_eof; goto out_no_object; } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); why = cachefiles_trace_read_no_data; - if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + if (!test_bit(NETFS_SREQ_ONDEMAND, _flags)) goto out_no_object; } @@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * retry: off = cachefiles_inject_read_error(); if (off == 0) - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = vfs_llseek(file, start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { if (off == (loff_t)-ENXIO) { why = cachefiles_trace_read_seek_nxio; @@ -449,21 +446,22 @@ retry: goto out; } - if (off >= subreq->start + subreq->len) { + if (off >= start + len) { why = cachefiles_trace_read_found_hole; goto download_and_store; } - if (off > subreq->start) { + if (off > start) { off = round_up(off, cache->bsize); - subreq->len = off - subreq->start; + len = off - start; + *_len = len; why = cachefiles_trace_read_found_part; goto download_and_store; } to = cachefiles_inject_read_error(); if (to == 0) - to = vfs_llseek(file, subreq->start, SEEK_HOLE); + to = vfs_llseek(file, start, SEEK_HOLE); if (to < 0 && to >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), to, cachefiles_trace_seek_error); @@ -471,12 +469,13 @@ retry: goto out; } - if (to < subreq->start + subreq->len) { - if (subreq->start + subreq->len >= i_size) + if (to < start + len) { + if (start + len >= i_size) to = round_up(to, cache->bsize); else to = round_down(to, cache->bsize); - subreq->len = to - subreq->start; + len = to - start; + *_len = len; } why = cachefiles_trace_read_have_data; @@ -484,12 +483,11 @@ retry: goto out; download_and_store: - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { - rc = cachefiles_ondemand_read(object, subreq->start, - subreq->len); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) { + rc = cachefiles_ondemand_read(object, start, len); if (!rc) { - __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + __clear_bit(NETFS_SREQ_ONDEMAND, _flags); goto retry; } ret = NETFS_INVALID_READ; @@ -497,10 +495,34 @@ download_and_store: out: cachefiles_end_secure(cache, saved_cred); out_no_object: - trace_cachefiles_prep_read(subreq, ret, why, ino); + 
trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino); return ret; } +/* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, + subreq->start, &subreq->len, i_size, + &subreq->flags, subreq->rreq->inode->i_ino); +} + +/* + * Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source +cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t ino) +{ + return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino); +} + /* * Prepare for a write to occur. */ @@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .prepare_ondemand_read = cachefiles_prepare_ondemand_read, .query_occupancy = cachefiles_query_occupancy, }; -- cgit v1.2.1 From 709fe09e281776b5e024fb5934c0485a866b7468 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:12 +0800 Subject: erofs: switch to prepare_ondemand_read() in fscache mode Switch to prepare_ondemand_read() interface and a self-contained request completion to get rid of netfs_io_[request|subrequest]. The whole request will still be split into slices (subrequest) according to the cache state of the backing file. As long as one of the subrequests fails, the whole request will be marked as failed. Reviewed-by: Gao Xiang Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221124034212.81892-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 261 +++++++++++++++++++---------------------------------- 1 file changed, 94 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 6a792a513d6b..3e794891cd91 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -11,257 +11,180 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; -static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, +struct erofs_fscache_request { + struct netfs_cache_resources cache_resources; + struct address_space *mapping; /* The mapping being accessed */ + loff_t start; /* Start position */ + size_t len; /* Length of the request */ + size_t submitted; /* Length of submitted */ + short error; /* 0 or error that occurred */ + refcount_t ref; +}; + +static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, loff_t start, size_t len) { - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); - if (!rreq) + req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); + if (!req) return ERR_PTR(-ENOMEM); - rreq->start = start; - rreq->len = len; - rreq->mapping = mapping; - rreq->inode = mapping->host; - INIT_LIST_HEAD(&rreq->subrequests); - refcount_set(&rreq->ref, 1); - return rreq; -} - -static void erofs_fscache_put_request(struct netfs_io_request *rreq) -{ - if (!refcount_dec_and_test(&rreq->ref)) - return; - if (rreq->cache_resources.ops) - 
rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); -} - -static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq) -{ - if (!refcount_dec_and_test(&subreq->ref)) - return; - erofs_fscache_put_request(subreq->rreq); - kfree(subreq); -} + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); -static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - erofs_fscache_put_subrequest(subreq); - } + return req; } -static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) +static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { - struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos = 0; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; + bool failed = req->error; + pgoff_t start_page = req->start / PAGE_SIZE; + pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1; - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); + XA_STATE(xas, &req->mapping->i_pages, start_page); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos, pgend; - bool pg_failed = false; - if (xas_retry(&xas, folio)) continue; - - pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - pgend = pgpos + folio_size(folio); - - for (;;) { - if (!subreq) { - pg_failed = true; - break; - } - - pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) - break; - - iopos += subreq->len; - if (!list_is_last(&subreq->rreq_link, - &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - if (pgend == iopos) - break; - } - - if (!pg_failed) + if (!failed) folio_mark_uptodate(folio); - folio_unlock(folio); } rcu_read_unlock(); + + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + + kfree(req); } -static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq) +static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - erofs_fscache_rreq_unlock_folios(rreq); - erofs_fscache_clear_subrequests(rreq); - erofs_fscache_put_request(rreq); + if (refcount_dec_and_test(&req->ref)) + erofs_fscache_req_complete(req); } -static void erofc_fscache_subreq_complete(void *priv, +static void erofs_fscache_subreq_complete(void *priv, ssize_t transferred_or_error, bool was_async) { - struct netfs_io_subrequest *subreq = priv; - struct netfs_io_request *rreq = subreq->rreq; + struct erofs_fscache_request *req = priv; if (IS_ERR_VALUE(transferred_or_error)) - subreq->error = transferred_or_error; - - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - erofs_fscache_put_subrequest(subreq); + req->error = transferred_or_error; + erofs_fscache_req_put(req); } /* - * Read data from fscache and fill the read data into page cache described by - * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes - * the start physical address in the cache file. 
+ * Read data from fscache (cookie, pstart, len), and fill the read data into + * page cache described by (req->mapping, lstart, len). @pstart describeis the + * start physical address in the cache file. */ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct netfs_io_request *rreq, loff_t pstart) + struct erofs_fscache_request *req, loff_t pstart, size_t len) { enum netfs_io_source source; - struct super_block *sb = rreq->mapping->host->i_sb; - struct netfs_io_subrequest *subreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; + struct super_block *sb = req->mapping->host->i_sb; + struct netfs_cache_resources *cres = &req->cache_resources; struct iov_iter iter; - loff_t start = rreq->start; - size_t len = rreq->len; + loff_t lstart = req->start + req->submitted; size_t done = 0; int ret; - atomic_set(&rreq->nr_outstanding, 1); + DBG_BUGON(len > req->len - req->submitted); ret = fscache_begin_read_operation(cres, cookie); if (ret) - goto out; + return ret; while (done < len) { - subreq = kzalloc(sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - refcount_inc(&rreq->ref); - } else { - ret = -ENOMEM; - goto out; - } - - subreq->start = pstart + done; - subreq->len = len - done; - subreq->flags = 1 << NETFS_SREQ_ONDEMAND; + loff_t sstart = pstart + done; + size_t slen = len - done; + unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - source = cres->ops->prepare_read(subreq, LLONG_MAX); - if (WARN_ON(subreq->len == 0)) + source = cres->ops->prepare_ondemand_read(cres, + sstart, &slen, LLONG_MAX, &flags, 0); + if (WARN_ON(slen == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", - source); - ret = -EIO; - subreq->error = ret; - erofs_fscache_put_subrequest(subreq); - goto out; + erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + return -EIO; } - atomic_inc(&rreq->nr_outstanding); + refcount_inc(&req->ref); + iov_iter_xarray(&iter, READ, &req->mapping->i_pages, + lstart + done, slen); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - start + done, subreq->len); - - ret = fscache_read(cres, subreq->start, &iter, - NETFS_READ_HOLE_FAIL, - erofc_fscache_subreq_complete, subreq); + ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, + erofs_fscache_subreq_complete, req); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { erofs_err(sb, "failed to fscache_read (ret %d)", ret); - goto out; + return ret; } - done += subreq->len; + done += slen; } -out: - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - return ret; + DBG_BUGON(done != len); + req->submitted += len; + return 0; } static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_dev mdev = { .m_deviceid = 0, .m_pa = folio_pos(folio), }; ret = erofs_map_dev(sb, &mdev); - if (ret) - goto out; + if (ret) { + folio_unlock(folio); + return ret; + } - rreq = erofs_fscache_alloc_request(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out; + if (IS_ERR(req)) { + folio_unlock(folio); + return 
PTR_ERR(req); } - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa); -out: - folio_unlock(folio); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa, folio_size(folio)); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return ret; } /* * Read into page cache in the range described by (@pos, @len). * - * On return, the caller is responsible for page unlocking if the output @unlock - * is true, or the callee will take this responsibility through netfs_io_request - * interface. + * On return, if the output @unlock is true, the caller is responsible for page + * unlocking; otherwise the callee will take this responsibility through request + * completion. * * The return value is the number of bytes successfully handled, or negative * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after netfs_io_request is - * allocated, so that .readahead() could advance rac accordingly. + * instead of the error code is returned on failure after request is allocated, + * so that .readahead() could advance rac accordingly. */ static int erofs_fscache_data_read(struct address_space *mapping, loff_t pos, size_t len, bool *unlock) { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; @@ -318,13 +241,17 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - rreq = erofs_fscache_alloc_request(mapping, pos, count); - if (IS_ERR(rreq)) - return PTR_ERR(rreq); + req = erofs_fscache_req_alloc(mapping, pos, count); + if (IS_ERR(req)) + return PTR_ERR(req); *unlock = false; - erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa + (pos - map.m_la), count); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return count; } -- cgit v1.2.1 From be62c5198861156d77b60babb89e70e21c73eb7b Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Dec 2022 15:42:55 +0800 Subject: erofs: support large folios for fscache mode When large folios supported, one folio can be split into several slices, each of which may be mapped to META/UNMAPPED/MAPPED, and the folio can be unlocked as a whole only when all slices have completed. Thus always allocate erofs_fscache_request for each .read_folio() or .readahead(), in which case the allocated request is responsible for unlocking folios when all slices have completed. As described above, each folio or folio range can be mapped into several slices, while these slices may be mapped to different cookies, and thus each slice needs its own netfs_cache_resources. Here we introduce chained requests to support this, where each .read_folio() or .readahead() calling can correspond to multiple requests. Each request has its own netfs_cache_resources and thus is used to access one cookie. Among these requests, there's a primary request, with the others pointing to the primary request. 
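A hedged sketch of the chaining rule just described, with names mirroring the patch and error handling trimmed:

static struct erofs_fscache_request *
req_chain(struct erofs_fscache_request *primary, size_t len)
{
	struct erofs_fscache_request *req;

	/* the first slice simply reuses the primary request */
	if (!primary->submitted) {
		refcount_inc(&primary->ref);
		return primary;
	}
	/* later slices may target another cookie, so each gets its own
	 * netfs_cache_resources; it pins the primary, and the folios are
	 * unlocked only when the primary's refcount finally drops */
	req = erofs_fscache_req_alloc(primary->mapping,
				      primary->start + primary->submitted, len);
	if (!IS_ERR(req)) {
		req->primary = primary;
		refcount_inc(&primary->ref);
	}
	return req;
}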
Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221201074256.16639-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 148 +++++++++++++++++++++++++++++------------------------ 1 file changed, 80 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 3e794891cd91..f14886c479bd 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -12,6 +12,7 @@ static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { + struct erofs_fscache_request *primary; struct netfs_cache_resources cache_resources; struct address_space *mapping; /* The mapping being accessed */ loff_t start; /* Start position */ @@ -38,6 +39,26 @@ static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_spac return req; } +static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, + size_t len) +{ + struct erofs_fscache_request *req; + + /* use primary request for the first submission */ + if (!primary->submitted) { + refcount_inc(&primary->ref); + return primary; + } + + req = erofs_fscache_req_alloc(primary->mapping, + primary->start + primary->submitted, len); + if (!IS_ERR(req)) { + req->primary = primary; + refcount_inc(&primary->ref); + } + return req; +} + static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { struct folio *folio; @@ -56,17 +77,19 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req) folio_unlock(folio); } rcu_read_unlock(); - - if (req->cache_resources.ops) - req->cache_resources.ops->end_operation(&req->cache_resources); - - kfree(req); } static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - if (refcount_dec_and_test(&req->ref)) - erofs_fscache_req_complete(req); + if (refcount_dec_and_test(&req->ref)) { + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + if (!req->primary) + erofs_fscache_req_complete(req); + else + erofs_fscache_req_put(req->primary); + kfree(req); + } } static void erofs_fscache_subreq_complete(void *priv, @@ -74,8 +97,12 @@ static void erofs_fscache_subreq_complete(void *priv, { struct erofs_fscache_request *req = priv; - if (IS_ERR_VALUE(transferred_or_error)) - req->error = transferred_or_error; + if (IS_ERR_VALUE(transferred_or_error)) { + if (req->primary) + req->primary->error = transferred_or_error; + else + req->error = transferred_or_error; + } erofs_fscache_req_put(req); } @@ -131,7 +158,6 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, done += slen; } DBG_BUGON(done != len); - req->submitted += len; return 0; } @@ -167,32 +193,19 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return ret; } -/* - * Read into page cache in the range described by (@pos, @len). - * - * On return, if the output @unlock is true, the caller is responsible for page - * unlocking; otherwise the callee will take this responsibility through request - * completion. - * - * The return value is the number of bytes successfully handled, or negative - * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after request is allocated, - * so that .readahead() could advance rac accordingly. 
- */ -static int erofs_fscache_data_read(struct address_space *mapping, - loff_t pos, size_t len, bool *unlock) +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) { + struct address_space *mapping = primary->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; + loff_t pos = primary->start + primary->submitted; size_t count; int ret; - *unlock = true; - map.m_la = pos; ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) @@ -220,17 +233,19 @@ static int erofs_fscache_data_read(struct address_space *mapping, } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - return PAGE_SIZE; + primary->submitted += PAGE_SIZE; + return 0; } + count = primary->len - primary->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { - count = len; iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - return count; + primary->submitted += count; + return 0; } - count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + count = min_t(size_t, map.m_llen - (pos - map.m_la), count); DBG_BUGON(!count || count % PAGE_SIZE); mdev = (struct erofs_map_dev) { @@ -241,68 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - req = erofs_fscache_req_alloc(mapping, pos, count); + req = erofs_fscache_req_chain(primary, count); if (IS_ERR(req)) return PTR_ERR(req); - *unlock = false; ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, req, mdev.m_pa + (pos - map.m_la), count); - if (ret) - req->error = ret; - erofs_fscache_req_put(req); - return count; + primary->submitted += count; + return ret; } -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +static int erofs_fscache_data_read(struct erofs_fscache_request *req) { - bool unlock; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + do { + ret = erofs_fscache_data_read_slice(req); + if (ret) + req->error = ret; + } while (!ret && req->submitted < req->len); - ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), - folio_size(folio), &unlock); - if (unlock) { - if (ret > 0) - folio_mark_uptodate(folio); + return ret; +} + +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fscache_request *req; + int ret; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { folio_unlock(folio); + return PTR_ERR(req); } - return ret < 0 ? ret : 0; + + ret = erofs_fscache_data_read(req); + erofs_fscache_req_put(req); + return ret; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct folio *folio; - size_t len, done = 0; - loff_t start, pos; - bool unlock; - int ret, size; + struct erofs_fscache_request *req; if (!readahead_count(rac)) return; - start = readahead_pos(rac); - len = readahead_length(rac); + req = erofs_fscache_req_alloc(rac->mapping, + readahead_pos(rac), readahead_length(rac)); + if (IS_ERR(req)) + return; - do { - pos = start + done; - ret = erofs_fscache_data_read(rac->mapping, pos, - len - done, &unlock); - if (ret <= 0) - return; + /* The request completion will drop refs on the folios. 
*/ + while (readahead_folio(rac)) + ; - size = ret; - while (size) { - folio = readahead_folio(rac); - size -= folio_size(folio); - if (unlock) { - folio_mark_uptodate(folio); - folio_unlock(folio); - } - } - } while ((done += ret) < len); + erofs_fscache_data_read(req); + erofs_fscache_req_put(req); } static const struct address_space_operations erofs_fscache_meta_aops = { -- cgit v1.2.1 From e6687b89225ee9c817e6dcbadc873f6a4691e5c2 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Dec 2022 15:42:56 +0800 Subject: erofs: enable large folios for fscache mode Enable large folios for fscache mode. Enable this feature for non-compressed format for now, until the compression part supports large folios later. One thing worth noting is that, the feature is not enabled for the meta data routine since meta inodes don't need large folios for now, nor do they support readahead yet. Also document this new feature. Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221201074256.16639-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e457b8a59ee7..85932086d23f 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,8 +295,7 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; - if (!erofs_is_fscache_mode(inode->i_sb)) - mapping_set_large_folios(inode->i_mapping); + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; -- cgit v1.2.1 From 927e5010ff5bd7446a22c511ab8643b9385ddf4d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 18 Oct 2022 18:53:13 +0800 Subject: erofs: use kmap_local_page() only for erofs_bread() Convert all mapped erofs_bread() users to use kmap_local_page() instead of kmap() or kmap_atomic(). 
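The conversion pattern in a nutshell: kmap_local_page() returns a cheap CPU-local mapping that must be released in reverse order via kunmap_local(), which takes the mapped address rather than the page (hence erofs_unmap_metabuf() below now passes buf->base instead of buf->page). An illustrative fragment, not lifted from the diff:

static void copy_from_page(void *dst, struct page *page,
			   size_t offset, size_t len)
{
	void *addr = kmap_local_page(page);	/* CPU-local, no global map */

	memcpy(dst, addr + offset, len);	/* must not sleep while mapped */
	kunmap_local(addr);			/* takes the address, not the page */
}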
Signed-off-by: Gao Xiang Reviewed-and-tested-by: Jingbo Xu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221018105313.4940-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 8 ++------ fs/erofs/inode.c | 1 + fs/erofs/internal.h | 3 +-- fs/erofs/xattr.c | 8 ++++---- fs/erofs/zmap.c | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c9526c627dda..f57f921683d7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -13,9 +13,7 @@ void erofs_unmap_metabuf(struct erofs_buf *buf) { if (buf->kmap_type == EROFS_KMAP) - kunmap(buf->page); - else if (buf->kmap_type == EROFS_KMAP_ATOMIC) - kunmap_atomic(buf->base); + kunmap_local(buf->base); buf->base = NULL; buf->kmap_type = EROFS_NO_KMAP; } @@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, } if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap(page); - else if (type == EROFS_KMAP_ATOMIC) - buf->base = kmap_atomic(page); + buf->base = kmap_local_page(page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 85932086d23f..5b3a793103af 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode) case S_IFDIR: inode->i_op = &erofs_dir_iops; inode->i_fop = &erofs_dir_fops; + inode_nohighmem(inode); break; case S_IFLNK: err = erofs_fill_symlink(inode, kaddr, ofs); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index e51f27b6bde1..bb8501c0ff5b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap() to map the buffer */ - EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ + EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8106bcb5a38d..a62fb8a3318a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) it->blkaddr += erofs_blknr(it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); @@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; @@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; @@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 0bb66927e3d0..749a5ac943f4 100644 --- 
a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int advise, type; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); @@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); -- cgit v1.2.1 From c42c0ffe81176940bd5dead474216b7198d77675 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 5 Dec 2022 11:49:57 +0800 Subject: erofs: Fix pcluster memleak when its block address is zero syzkaller reported a memleak: https://syzkaller.appspot.com/bug?id=62f37ff612f0021641eda5b17f056f1668aa9aed unreferenced object 0xffff88811009c7f8 (size 136): ... backtrace: [] z_erofs_do_read_page+0x99b/0x1740 [] z_erofs_readahead+0x24e/0x580 [] read_pages+0x86/0x3d0 ... syzkaller constructed a case: in z_erofs_register_pcluster(), ztailpacking = false and map->m_pa = zero. This makes pcl->obj.index be zero although pcl is not a inline pcluster. Then following path adds refcount for grp, but the refcount won't be put because pcl is inline. z_erofs_readahead() z_erofs_do_read_page() # for another page z_erofs_collector_begin() erofs_find_workgroup() erofs_workgroup_get() Since it's illegal for the block address of a non-inlined pcluster to be zero, add check here to avoid registering the pcluster which would be leaked. Fixes: cecf864d3d76 ("erofs: support inline data decompression") Reported-by: syzbot+6f8cd9a0155b366d227f@syzkaller.appspotmail.com Signed-off-by: Chen Zhongjin Reviewed-by: Yue Hu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/Y42Kz6sVkf+XqJRB@debian Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b66c16473273..ccf7c55d477f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -496,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) struct erofs_workgroup *grp; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED)) { + if (!(map->m_flags & EROFS_MAP_ENCODED) || + (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { DBG_BUGON(1); return -EFSCORRUPTED; } -- cgit v1.2.1 From d5d188b8f8b38d3d71dd05993874b4fc9284ce95 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 5 Dec 2022 23:00:49 +0800 Subject: erofs: fix missing unmap if z_erofs_get_extent_compressedlen() fails Otherwise, meta buffers could be leaked. 
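In other words, every failure after the meta buffer has been mapped must funnel through the same unmap label as the success path; a simplified sketch of the resulting control flow:

	err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
	if (err)
		goto unmap_out;	/* previously jumped past the unmap */
	/* ... remaining extent handling ... */
unmap_out:
	erofs_unmap_metabuf(&m.map->buf);	/* always released exactly once */
	return err;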
Fixes: cec6e93beadf ("erofs: support parsing big pcluster compress indexes") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221205150050.47784-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 749a5ac943f4..98eff1259de4 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -694,7 +694,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) - goto out; + goto unmap_out; } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { @@ -718,14 +718,12 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } + unmap_out: erofs_unmap_metabuf(&m.map->buf); - -out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); - return err; } -- cgit v1.2.1 From c505feba4c0d76084e56ec498ce819f02a7043ae Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 5 Dec 2022 23:00:50 +0800 Subject: erofs: validate the extent length for uncompressed pclusters syzkaller reported a KASAN use-after-free: https://syzkaller.appspot.com/bug?extid=2ae90e873e97f1faf6f2 The referenced fuzzed image actually has two issues: - m_pa == 0 as a non-inlined pcluster; - The logical length is longer than its physical length. The first issue has already been addressed. This patch addresses the second issue by checking the extent length validity. Reported-by: syzbot+2ae90e873e97f1faf6f2@syzkaller.appspotmail.com Fixes: 02827e1796b3 ("staging: erofs: add erofs_map_blocks_iter") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221205150050.47784-2-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98eff1259de4..0150570c33aa 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -698,6 +698,11 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (map->m_llen > map->m_plen) { + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto unmap_out; + } if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; -- cgit v1.2.1
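The invariant behind this last check, restated as a sketch (simplified from the hunk above): a PLAIN pcluster stores its data uncompressed, so the logical extent can never be longer than the physical one, and a longer m_llen can only come from a crafted or corrupted image:

	if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
	    map->m_llen > map->m_plen) {	/* uncompressed data cannot "grow" */
		DBG_BUGON(1);
		err = -EFSCORRUPTED;
		goto unmap_out;
	}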