diff options
Diffstat (limited to 'fs')
54 files changed, 1239 insertions, 772 deletions
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ec96285357e0..2044f1e18629 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -122,7 +122,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } else { num_bytes = 0; } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + if (qgroup_to_release_ret && + block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { qgroup_to_release = block_rsv->qgroup_rsv_reserved - block_rsv->qgroup_rsv_size; block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dcb510f38dda..dbbae92ac23d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4411,10 +4411,12 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; + struct btrfs_key orig_key; struct btrfs_disk_key found_key; int ret; btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; if (key.offset > 0) { key.offset--; @@ -4431,8 +4433,36 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret <= 0) return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one or more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0, just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. 
+ */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5b1b5e1a63c8..acae82a5f8ee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3215,23 +3215,34 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) { int ret; const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); - bool clear_free_space_tree = false; + bool rebuild_free_space_tree = false; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - clear_free_space_tree = true; + rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); - clear_free_space_tree = true; + rebuild_free_space_tree = true; } - if (clear_free_space_tree) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); + if (rebuild_free_space_tree) { + btrfs_info(fs_info, "rebuilding free space tree"); + ret = btrfs_rebuild_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); + "failed to rebuild free space tree: %d", ret); + goto out; + } + } + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "disabling free space tree"); + ret = btrfs_delete_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to disable free space tree: %d", ret); goto out; } } diff --git a/fs/btrfs/file-item.c 
b/fs/btrfs/file-item.c index 6bb9fa961a6a..4fab7da63259 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -47,13 +47,13 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 start, end, i_size; int ret; + spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); if (btrfs_fs_incompat(fs_info, NO_HOLES)) { inode->disk_i_size = i_size; - return; + goto out_unlock; } - spin_lock(&inode->lock); ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, &end, EXTENT_DIRTY); if (!ret && start == 0) @@ -61,6 +61,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz else i_size = 0; inode->disk_i_size = i_size; +out_unlock: spin_unlock(&inode->lock); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6a8f2bd350f4..4cd8e44cba4c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -861,15 +861,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - recalculate_thresholds(ctl); - spin_unlock(&ctl->tree_lock); if (ret) { + spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } + ctl->total_bitmaps++; + recalculate_thresholds(ctl); + spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 367bcfcf68f5..e040eea3937d 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1247,7 +1247,7 @@ out: return ret; } -int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; @@ -1293,6 +1293,54 @@ abort: return ret; } +int 
btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *block_group; + + block_group = rb_entry(node, struct btrfs_block_group, + cache_node); + ret = populate_free_space_tree(trans, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); + clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + + ret = btrfs_commit_transaction(trans); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; +abort: + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; +} + static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index dc2463e4cfe3..6d5551d0ced8 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -18,7 +18,8 @@ struct btrfs_caching_control; void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); -int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); +int 
btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e516aefbf51..56e9efbffd58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3237,6 +3237,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); + } else if (btrfs_is_data_reloc_root(inode->root)) { + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } btrfs_free_io_failure_record(inode, start, end); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0cebc203c4cc..9de647e48e7e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -443,7 +443,9 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, case BTRFS_EXCLOP_BALANCE_PAUSED: spin_lock(&fs_info->super_lock); ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD); + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; spin_unlock(&fs_info->super_lock); break; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dd8777872143..228eeb04d03d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -148,10 +148,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) pr_cont("shared data backref parent %llu count %u\n", offset, btrfs_shared_data_ref_count(eb, sref)); /* - * offset is supposed to be a tree block which - * must be aligned to nodesize. 
+ * Offset is supposed to be a tree block which must be + * aligned to sectorsize. */ - if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) pr_info( "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e43b16199e22..6438300fa246 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1136,7 +1136,11 @@ out: !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(info, "cannot disable free space tree"); ret = -EINVAL; - + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); + ret = -EINVAL; } if (!ret) ret = btrfs_check_mountopts_zoned(info); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index e97c5a1ac95d..836babd23db5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -119,10 +119,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, int i; for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { - u64 bytenr; - - bytenr = ((zones[i].start + zones[i].len) - << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE; + u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT; + u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - + BTRFS_SUPER_INFO_SIZE; page[i] = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); @@ -1163,12 +1162,12 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) return -ERANGE; /* All the zones are conventional */ - if (find_next_bit(zinfo->seq_zones, begin, end) == end) + if (find_next_bit(zinfo->seq_zones, end, begin) == end) return 0; /* All the zones are sequential and empty */ - if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && - find_next_zero_bit(zinfo->empty_zones, begin, end) == end) + if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end && + find_next_zero_bit(zinfo->empty_zones, end, begin) 
== end) return 0; for (pos = start; pos < start + size; pos += zinfo->zone_size) { @@ -1605,11 +1604,11 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, !list_empty(&eb->release_list)) return; + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bits_nowait(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, EXTENT_DIRTY); - memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); spin_lock(&trans->releasing_ebs_lock); list_add_tail(&eb->release_list, &trans->releasing_ebs); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 03e3e95cf25b..078df1e2dd18 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -743,6 +743,7 @@ static void cifs_umount_begin(struct super_block *sb) spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); + cifs_close_all_deferred_files(tcon); /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ /* cancel_notify_requests(tcon); */ if (tcon->ses && tcon->ses->server) { @@ -758,6 +759,20 @@ static void cifs_umount_begin(struct super_block *sb) return; } +static int cifs_freeze(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + + if (cifs_sb == NULL) + return 0; + + tcon = cifs_sb_master_tcon(cifs_sb); + + cifs_close_all_deferred_files(tcon); + return 0; +} + #ifdef CONFIG_CIFS_STATS2 static int cifs_show_stats(struct seq_file *s, struct dentry *root) { @@ -796,6 +811,7 @@ static const struct super_operations cifs_super_ops = { as opens */ .show_options = cifs_show_options, .umount_begin = cifs_umount_begin, + .freeze_fs = cifs_freeze, #ifdef CONFIG_CIFS_STATS2 .show_stats = cifs_show_stats, #endif diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 21b31d1640e5..935fe198a4ba 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2742,6 +2742,13 @@ cifs_match_super(struct super_block *sb, void *data) spin_lock(&cifs_tcp_ses_lock); 
cifs_sb = CIFS_SB(sb); + + /* We do not want to use a superblock that has been shutdown */ + if (CIFS_MOUNT_SHUTDOWN & cifs_sb->mnt_cifs_flags) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); if (tlink == NULL) { /* can not match superblock if tlink were ever null */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ccf311750927..7468f8baf499 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1682,7 +1682,7 @@ smb2_copychunk_range(const unsigned int xid, pcchunk->SourceOffset = cpu_to_le64(src_off); pcchunk->TargetOffset = cpu_to_le64(dest_off); pcchunk->Length = - cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); + cpu_to_le32(min_t(u64, len, tcon->max_bytes_chunk)); /* Request server copy to target from src identified by key */ kfree(retbuf); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8ff4b9192a9f..f2c415f31b75 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -303,6 +303,22 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, return desc; } +static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t next_zero_bit; + unsigned long bitmap_size = sb->s_blocksize * 8; + unsigned int offset = num_clusters_in_group(sb, block_group); + + if (bitmap_size <= offset) + return 0; + + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); + + return (next_zero_bit < bitmap_size ? next_zero_bit : 0); +} + /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. 
@@ -401,6 +417,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } + blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 7ada374ff27d..44e83521bfde 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -269,14 +269,12 @@ static void __es_find_extent_range(struct inode *inode, /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %x\n", - lblk, es1->es_lblk, es1->es_len, - ext4_es_pblock(es1), ext4_es_status(es1)); - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; } es1 = __es_tree_search(&tree->root, lblk); @@ -295,7 +293,7 @@ out: } if (es1 && matching_fn(es1)) { - tree->cache_es = es1; + WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; @@ -933,14 +931,12 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u)\n", - lblk, es1->es_lblk, es1->es_len); - found = 1; - goto out; - } + es1 = 
READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; } node = tree->root.rb_node; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 147b5241dd94..46c3423ddfa1 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -277,7 +277,11 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, } default: hinfo->hash = 0; - return -1; + hinfo->minor_hash = 0; + ext4_warning(dir->i_sb, + "invalid/unsupported hash tree version %u", + hinfo->hash_version); + return -EINVAL; } hash = hash & ~1; if (hash == (EXT4_HTREE_EOF_32BIT << 1)) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c4475a74c762..3a91be1d9bbe 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -34,6 +34,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; + void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) @@ -57,14 +58,23 @@ static int get_max_inline_xattr_value_size(struct inode *inode, raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. 
*/ - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + while (!IS_LAST_ENTRY(entry)) { + void *next = EXT4_XATTR_NEXT(entry); + + if (next >= end) { + EXT4_ERROR_INODE(inode, + "corrupt xattr in inline inode"); + return 0; + } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } + entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); @@ -351,7 +361,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); - if (error == -ENODATA) + if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); @@ -1178,6 +1188,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); + unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; @@ -1252,6 +1263,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); + unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { @@ -1259,7 +1271,6 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, buf, inline_size); } - unlock_buffer(data_bh); out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 42003b5c4cad..ffc810436ef2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3503,7 +3503,7 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); - WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } diff --git a/fs/ext4/mballoc.c 
b/fs/ext4/mballoc.c index 9dad93059945..912c4a1093fe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4820,7 +4820,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { + ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", + e4b->bd_group, group, pa->pa_pstart); + return 0; + } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 588cb09c5291..23930ed3cbda 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -39,28 +39,36 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ -static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +static int write_mmp_block_thawed(struct super_block *sb, + struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); - /* - * We protect against freezing so that we don't create dirty buffers - * on frozen filesystem. - */ - sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); - sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return -EIO; - return 0; } +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +{ + int err; + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); + err = write_mmp_block_thawed(sb, bh); + sb_end_write(sb); + return err; +} + /* * Read the MMP block. 
It _must_ be read from disk and hence we clear the * uptodate flag on the buffer. @@ -346,7 +354,11 @@ skip: seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); - retval = write_mmp_block(sb, bh); + /* + * On mount / remount we are protected against fs freezing (by s_umount + * semaphore) and grabbing freeze protection upsets lockdep + */ + retval = write_mmp_block_thawed(sb, bh); if (retval) goto failed; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 56f09598448b..5a3dbbabe23a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -674,7 +674,7 @@ static struct stats dx_show_leaf(struct inode *dir, len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ - ext4fs_dirhash(dir, de->name, + (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, @@ -709,8 +709,9 @@ static struct stats dx_show_leaf(struct inode *dir, if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else - ext4fs_dirhash(dir, de->name, - de->name_len, &h); + (void) ext4fs_dirhash(dir, + de->name, + de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -720,7 +721,8 @@ static struct stats dx_show_leaf(struct inode *dir, #else int len = de->name_len; char *name = de->name; - ext4fs_dirhash(dir, de->name, de->name_len, &h); + (void) ext4fs_dirhash(dir, de->name, + de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif @@ -849,8 +851,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && - !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { + int ret = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), hinfo); + if (ret < 0) { + ret_err = ERR_PTR(ret); + goto fail; + } + } 
hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -1111,7 +1119,12 @@ static int htree_dirblock_to_tree(struct file *dir_file, hinfo->minor_hash = 0; } } else { - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + err = ext4fs_dirhash(dir, de->name, + de->name_len, hinfo); + if (err < 0) { + count = err; + goto errout; + } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && @@ -1313,8 +1326,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); - else - ext4fs_dirhash(dir, de->name, de->name_len, &h); + else { + int err = ext4fs_dirhash(dir, de->name, + de->name_len, &h); + if (err < 0) + return err; + } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1452,10 +1469,9 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) - ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else - ext4fs_dirhash(dir, iname->name, iname->len, hinfo); - return 0; + return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif @@ -2298,10 +2314,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ - if (!ext4_hash_in_dirent(dir)) - ext4fs_dirhash(dir, fname_name(fname), - fname_len(fname), &fname->hinfo); - + if (!ext4_hash_in_dirent(dir)) { + int err = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); + if (err < 0) { + brelse(bh2); + brelse(bh); + return err; + } + } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2528e8216c33..d542f068ca99 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c 
@@ -3195,11 +3195,9 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ - if (ext4_has_feature_64bit(sb) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) + if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + sbi->s_desc_size - offset); out: return cpu_to_le16(crc); @@ -6568,9 +6566,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } #ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); @@ -6580,6 +6575,9 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } } + /* Release old quota file names */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); @@ -6590,6 +6588,13 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) return 0; restore_opts: + /* + * If there was a failing r/w to ro transition, we may need to + * re-enable quota + */ + if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b17c1b90e122..b1b8fe86ccdb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2564,6 +2564,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int needs_kvfree = 0; int error; is = 
kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); @@ -2586,7 +2587,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = -ENOMEM; goto out; } - + needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; @@ -2625,7 +2626,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, out: kfree(b_entry_name); - if (entry->e_value_inum && buffer) + if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 770a606eb3f6..de6b056f090b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1134,7 +1134,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1203,7 +1203,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1224,7 +1224,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { @@ -1486,7 +1486,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) 
goto next_dnode; @@ -1696,7 +1696,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1741,7 +1741,7 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -2202,7 +2202,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2636,7 +2636,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, @@ -3361,7 +3361,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3402,7 +3402,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a216dcdf6941..a9baa121d829 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,23 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = 
si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -294,10 +302,16 @@ get_cache: sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem 
= 0; if (sbi->node_inode) { @@ -490,16 +504,18 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -566,8 +582,10 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -600,10 +618,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 
0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6c9e6f78a3e3..16692c96e765 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -15,6 +15,122 @@ #include "node.h" #include <trace/events/f2fs.h> +bool sanity_check_extent_cache(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct extent_info *ei; + + if (!fi->extent_tree[EX_READ]) + return true; + + ei = &fi->extent_tree[EX_READ]->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, + DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC_ENHANCE))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + return true; +} + +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen, + enum extent_type type) +{ + ei->fofs = fofs; + ei->len = len; + + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); +} + +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ + /* + * for recovered files during mount do not 
create extents + * if shrinker is not registered. + */ + if (list_empty(&F2FS_I_SB(inode)->s_list)) + return false; + + return __init_may_extent_tree(inode, type); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (et->type != EX_READ) + return; + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front, enum extent_type type) +{ + if (type == EX_READ) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back, enum extent_type type) +{ + return __is_extent_mergeable(back, cur, type); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front, enum extent_type type) +{ + return __is_extent_mergeable(cur, front, type); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -58,29 +174,6 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } -struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned long long key, bool *leftmost) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_entry *re; - - while (*p) { - *parent = *p; - re = rb_entry(*parent, struct rb_entry, rb_node); - - if (key < re->key) { - p = &(*p)->rb_left; - } else { - p = &(*p)->rb_right; - *leftmost = false; - } - } - - return p; -} - struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -189,7 +282,7 @@ lookup_neighbors: } bool 
f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, bool check_key) + struct rb_root_cached *root) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; @@ -206,23 +299,12 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, cur_re = rb_entry(cur, struct rb_entry, rb_node); next_re = rb_entry(next, struct rb_entry, rb_node); - if (check_key) { - if (cur_re->key > next_re->key) { - f2fs_info(sbi, "inconsistent rbtree, " - "cur(%llu) next(%llu)", - cur_re->key, next_re->key); - return false; - } - goto next; - } - if (cur_re->ofs + cur_re->len > next_re->ofs) { f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", cur_re->ofs, cur_re->len, next_re->ofs, next_re->len); return false; } -next: cur = next; } #endif @@ -237,6 +319,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -250,16 +333,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -275,61 +360,51 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - 
spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct 
f2fs_sb_info *sbi, struct extent_tree *et) { @@ -358,71 +433,78 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ + if (!__may_extent_tree(inode, EX_READ)) { + /* drop largest read extent */ if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) - return; + goto out; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + et->largest = en->ei; + et->cached_en = en; + + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (!F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_extent_tree(struct inode *inode) { - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); + /* 
initialize read cache */ + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; if (!et) return false; - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -436,23 +518,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); return ret; } @@ -461,18 +544,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if 
(prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -484,12 +569,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -499,6 +584,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -521,47 +607,50 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = 
NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); - + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; - } + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } - prev = et->largest; - dei.len = 0; + prev = et->largest; + dei.len = 0; - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } /* 1. 
lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, @@ -582,26 +671,30 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true, + type); next_en = en; } parts++; @@ -631,10 +724,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { + /* 3. 
update extent in read extent cache */ + BUG_ON(type != EX_READ); - set_extent_info(&ei, fofs, blkaddr, len); + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -664,19 +758,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -693,7 +788,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -704,24 +799,43 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) 
{ + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -729,61 +843,100 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. 
remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned 
int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -796,31 +949,44 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -828,76 +994,49 @@ void 
f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; - - trace_f2fs_destroy_extent_tree(inode, node_cnt); -} - -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (!f2fs_may_extent_tree(inode)) - return false; + F2FS_I(inode)->extent_tree[type] = NULL; - return f2fs_lookup_extent_tree(inode, pgofs, ei); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -void f2fs_update_extent_cache(struct dnode_of_data *dn) +void f2fs_destroy_extent_tree(struct inode *inode) { - pgoff_t fofs; - block_t blkaddr; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); + __destroy_extent_tree(inode, EX_READ); } -void 
f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - +static void __init_extent_tree_info(struct extent_tree_info *eti) { - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b44ca1decdd..a0a232551da9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -91,7 +91,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -593,35 +593,43 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of 
extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS -#define RECOVERY_MIN_RA_BLOCKS 1 - -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ - }; - unsigned long long key; /* 64-bits key */ - } __packed; + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ }; struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + }; }; struct extent_node { @@ -633,13 +641,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + 
struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -801,7 +821,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ @@ -826,7 +847,7 @@ struct f2fs_inode_info { loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -834,7 +855,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -842,17 +863,6 @@ static inline void set_raw_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -872,41 +882,6 @@ static inline bool 
__is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -1670,14 +1645,7 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1761,10 +1729,14 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ 
atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -2578,6 +2550,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -3865,9 +3838,17 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; 
unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3926,10 +3907,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -4052,10 +4033,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4144,12 +4125,9 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +bool sanity_check_extent_cache(struct inode *inode); struct 
rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned long long key, bool *left_most); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -4160,21 +4138,25 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); + struct rb_root_cached *root); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct 
f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4244,9 +4226,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4327,9 +4309,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) @@ -4400,26 +4383,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bf37983304a3..dbad2db68f1b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -618,7 +618,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -1496,7 +1496,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -2573,7 +2573,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2605,7 +2605,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. 
*/ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index aa928d1c8159..5cd19fdc1059 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -389,40 +389,95 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } -static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, - unsigned long long mtime, unsigned int segno, - struct rb_node *parent, struct rb_node **p, - bool left_most) +static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi, + struct rb_root_cached *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first_cached(root), *next; + struct victim_entry *cur_ve, *next_ve; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_ve = rb_entry(cur, struct victim_entry, rb_node); + next_ve = rb_entry(next, struct victim_entry, rb_node); + + if (cur_ve->mtime > next_ve->mtime) { + f2fs_info(sbi, "broken victim_rbtree, " + "cur_mtime(%llu) next_mtime(%llu)", + cur_ve->mtime, next_ve->mtime); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *node = am->root.rb_root.rb_node; + struct victim_entry *ve = NULL; + + while (node) { + ve = rb_entry(node, struct victim_entry, rb_node); + + if (mtime < ve->mtime) + node = node->rb_left; + else + node = node->rb_right; + } + return ve; +} + +static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; struct victim_entry *ve; - ve = f2fs_kmem_cache_alloc(victim_entry_slab, - GFP_NOFS, true, NULL); + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL); ve->mtime = mtime; ve->segno = segno; - 
rb_link_node(&ve->rb_node, parent, p); - rb_insert_color_cached(&ve->rb_node, &am->root, left_most); - list_add_tail(&ve->list, &am->victim_list); - am->victim_count++; return ve; } -static void insert_victim_entry(struct f2fs_sb_info *sbi, +static void __insert_victim_entry(struct f2fs_sb_info *sbi, unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; - struct rb_node **p; + struct rb_root_cached *root = &am->root; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; + struct victim_entry *ve; bool left_most = true; - p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most); - attach_victim_entry(sbi, mtime, segno, parent, p, left_most); + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + ve = rb_entry(parent, struct victim_entry, rb_node); + + if (mtime < ve->mtime) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + left_most = false; + } + } + + ve = __create_victim_entry(sbi, mtime, segno); + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, root, left_most); } static void add_victim_entry(struct f2fs_sb_info *sbi, @@ -458,19 +513,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (sit_i->dirty_max_mtime - mtime < p->age_threshold) return; - insert_victim_entry(sbi, mtime, segno); -} - -static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, - struct victim_sel_policy *p) -{ - struct atgc_management *am = &sbi->am; - struct rb_node *parent = NULL; - bool left_most; - - f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); - - return parent; + __insert_victim_entry(sbi, mtime, segno); } static void atgc_lookup_victim(struct f2fs_sb_info *sbi, @@ -480,7 +523,6 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, struct atgc_management *am = &sbi->am; struct rb_root_cached *root = &am->root; struct rb_node *node; - struct rb_entry *re; struct victim_entry *ve; unsigned long long 
total_time; unsigned long long age, u, accu; @@ -507,12 +549,10 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, node = rb_first_cached(root); next: - re = rb_entry_safe(node, struct rb_entry, rb_node); - if (!re) + ve = rb_entry_safe(node, struct victim_entry, rb_node); + if (!ve) return; - ve = (struct victim_entry *)re; - if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip; @@ -554,8 +594,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, { struct sit_info *sit_i = SIT_I(sbi); struct atgc_management *am = &sbi->am; - struct rb_node *node; - struct rb_entry *re; struct victim_entry *ve; unsigned long long age; unsigned long long max_mtime = sit_i->dirty_max_mtime; @@ -565,25 +603,22 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * am->victim_count / 100); - unsigned int cost; - unsigned int iter = 0; + unsigned int cost, iter; int stage = 0; if (max_mtime < min_mtime) return; max_mtime += 1; next_stage: - node = lookup_central_victim(sbi, p); + iter = 0; + ve = __lookup_victim_entry(sbi, p->age); next_node: - re = rb_entry_safe(node, struct rb_entry, rb_node); - if (!re) { - if (stage == 0) - goto skip_stage; + if (!ve) { + if (stage++ == 0) + goto next_stage; return; } - ve = (struct victim_entry *)re; - if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip_node; @@ -609,24 +644,20 @@ next_node: } skip_node: if (iter < dirty_threshold) { - if (stage == 0) - node = rb_prev(node); - else if (stage == 1) - node = rb_next(node); + ve = rb_entry(stage == 0 ? 
rb_prev(&ve->rb_node) : + rb_next(&ve->rb_node), + struct victim_entry, rb_node); goto next_node; } -skip_stage: - if (stage < 1) { - stage++; - iter = 0; + + if (stage++ == 0) goto next_stage; - } } + static void lookup_victim_by_age(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &sbi->am.root, true)); + f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root)); if (p->gc_mode == GC_AT) atgc_lookup_victim(sbi, p); @@ -1147,7 +1178,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1165,7 +1196,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 19b956c2d697..ca84024b9c9e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -55,20 +55,10 @@ struct gc_inode_list { struct radix_tree_root iroot; }; -struct victim_info { - unsigned long long mtime; /* mtime of section */ - unsigned int segno; /* section No. */ -}; - struct victim_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - unsigned long long mtime; /* mtime of section */ - unsigned int segno; /* segment No. */ - }; - struct victim_info vi; /* victim info */ - }; + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. 
*/ struct list_head list; }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 229ddc2f7b07..aab3b8b3ab0a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,22 +262,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree) { - struct extent_info *ei = &fi->extent_tree->largest; - - if (ei->len && - (!f2fs_is_valid_blkaddr(sbi, ei->blk, - DATA_GENERIC_ENHANCE) || - !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, - DATA_GENERIC_ENHANCE))) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", - __func__, inode->i_ino, - ei->blk, ei->fofs, ei->len); - return false; - } - } - if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", @@ -392,8 +376,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
@@ -415,12 +397,6 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); - return -EFSCORRUPTED; - } - /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -479,6 +455,22 @@ static int do_read_inode(struct inode *inode) } init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + if (!sanity_check_extent_cache(inode)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -607,7 +599,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); @@ -629,7 +621,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b6c14c9c33a0..d879a295b688 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -258,8 +258,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, } F2FS_I(inode)->i_inline_xattr_size = xattr_size; - f2fs_init_extent_tree(inode, NULL); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -282,6 +280,8 @@ static struct inode 
*f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_set_inode_flags(inode); + f2fs_init_extent_tree(inode); + trace_f2fs_new_inode(inode, 0); return inode; @@ -1002,12 +1002,20 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out; } + /* + * Copied from ext4_rename: we need to protect against old.inode + * directory getting converted from inline directory format into + * a normal one. + */ + if (S_ISDIR(old_inode->i_mode)) + inode_lock_nested(old_inode, I_MUTEX_NONDIR2); + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) err = PTR_ERR(old_page); - goto out; + goto out_unlock_old; } if (S_ISDIR(old_inode->i_mode)) { @@ -1115,6 +1123,9 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, f2fs_unlock_op(sbi); + if (S_ISDIR(old_inode->i_mode)) + inode_unlock(old_inode); + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1129,6 +1140,9 @@ out_dir: f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); +out_unlock_old: + if (S_ISDIR(old_inode->i_mode)) + inode_unlock(old_inode); out: iput(whiteout); return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b9ee5a1176a0..07419c3e42a5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -85,10 +85,12 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == EXTENT_CACHE) { - mem_size = (atomic_read(&sbi->total_ext_tree) * + } else if (type == READ_EXTENT_CACHE) { + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } 
else if (type == DISCARD_CACHE) { @@ -859,7 +861,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3c09cae058b0..0aa48704c77a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -146,7 +146,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ - EXTENT_CACHE, /* indicates extent cache */ + READ_EXTENT_CACHE, /* indicates read extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b0fbdee16a96..cbbf95b99541 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -262,7 +262,7 @@ retry: f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, - index, *old_addr, new_addr, recover); + index, old_addr ? 
*old_addr : 0, new_addr, recover); return 0; } @@ -452,8 +452,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) @@ -1473,7 +1474,7 @@ retry: goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root, false)); + &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -3001,7 +3002,7 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root, false)); + &dcc->root)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec..33c490e69ae3 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? 
count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -99,8 +102,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +133,7 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5af05411818a..c46533d65372 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -810,10 +810,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -1939,7 +1939,7 @@ 
static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nobarrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); @@ -2057,7 +2057,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); @@ -2198,7 +2198,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2288,7 +2288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aa33c39be182..d387708977a5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -827,7 +827,7 @@ void wbc_detach_inode(struct writeback_control *wbc) * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. 
*/ - if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + if (hweight16(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index b8f9d627f241..e3312fbf4c09 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -20,7 +20,7 @@ static DEFINE_MUTEX(init_lock); static struct ksmbd_conn_ops default_conn_ops; LIST_HEAD(conn_list); -DEFINE_RWLOCK(conn_list_lock); +DECLARE_RWSEM(conn_list_lock); /** * ksmbd_conn_free() - free resources of the connection instance @@ -32,9 +32,9 @@ DEFINE_RWLOCK(conn_list_lock); */ void ksmbd_conn_free(struct ksmbd_conn *conn) { - write_lock(&conn_list_lock); + down_write(&conn_list_lock); list_del(&conn->conns_list); - write_unlock(&conn_list_lock); + up_write(&conn_list_lock); xa_destroy(&conn->sessions); kvfree(conn->request_buf); @@ -56,7 +56,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) return NULL; conn->need_neg = true; - conn->status = KSMBD_SESS_NEW; + ksmbd_conn_set_new(conn); conn->local_nls = load_nls("utf8"); if (!conn->local_nls) conn->local_nls = load_nls_default(); @@ -84,9 +84,9 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) spin_lock_init(&conn->llist_lock); INIT_LIST_HEAD(&conn->lock_list); - write_lock(&conn_list_lock); + down_write(&conn_list_lock); list_add(&conn->conns_list, &conn_list); - write_unlock(&conn_list_lock); + up_write(&conn_list_lock); return conn; } @@ -95,7 +95,7 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) struct ksmbd_conn *t; bool ret = false; - read_lock(&conn_list_lock); + down_read(&conn_list_lock); list_for_each_entry(t, &conn_list, conns_list) { if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE)) continue; @@ -103,7 +103,7 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) ret = true; break; } - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); return ret; } @@ -149,19 +149,47 @@ int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) return ret; } -static void 
ksmbd_conn_lock(struct ksmbd_conn *conn) +void ksmbd_conn_lock(struct ksmbd_conn *conn) { mutex_lock(&conn->srv_mutex); } -static void ksmbd_conn_unlock(struct ksmbd_conn *conn) +void ksmbd_conn_unlock(struct ksmbd_conn *conn) { mutex_unlock(&conn->srv_mutex); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) +void ksmbd_all_conn_set_status(u64 sess_id, u32 status) { + struct ksmbd_conn *conn; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) + WRITE_ONCE(conn->status, status); + } + up_read(&conn_list_lock); +} + +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +{ + struct ksmbd_conn *bind_conn; + wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); + + down_read(&conn_list_lock); + list_for_each_entry(bind_conn, &conn_list, conns_list) { + if (bind_conn == conn) + continue; + + if ((bind_conn->binding || xa_load(&bind_conn->sessions, sess_id)) && + !ksmbd_conn_releasing(bind_conn) && + atomic_read(&bind_conn->req_running)) { + wait_event(bind_conn->req_running_q, + atomic_read(&bind_conn->req_running) == 0); + } + } + up_read(&conn_list_lock); } int ksmbd_conn_write(struct ksmbd_work *work) @@ -245,7 +273,7 @@ bool ksmbd_conn_alive(struct ksmbd_conn *conn) if (!ksmbd_server_running()) return false; - if (conn->status == KSMBD_SESS_EXITING) + if (ksmbd_conn_exiting(conn)) return false; if (kthread_should_stop()) @@ -305,7 +333,7 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); - if (conn->status == KSMBD_SESS_GOOD) + if (ksmbd_conn_good(conn)) max_allowed_pdu_size = SMB3_MAX_MSGSIZE + conn->vals->max_write_size; else @@ -314,7 +342,7 @@ int ksmbd_conn_handler_loop(void *p) if (pdu_size > max_allowed_pdu_size) { pr_err_ratelimited("PDU length(%u) excceed maximum allowed pdu size(%u) on connection(%d)\n", pdu_size, max_allowed_pdu_size, - conn->status); 
+ READ_ONCE(conn->status)); break; } @@ -362,10 +390,10 @@ int ksmbd_conn_handler_loop(void *p) } out: + ksmbd_conn_set_releasing(conn); /* Wait till all reference dropped to the Server object*/ wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0); - if (IS_ENABLED(CONFIG_UNICODE)) utf8_unload(conn->um); unload_nls(conn->local_nls); @@ -409,7 +437,7 @@ static void stop_sessions(void) struct ksmbd_transport *t; again: - read_lock(&conn_list_lock); + down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { struct task_struct *task; @@ -418,14 +446,14 @@ again: if (task) ksmbd_debug(CONN, "Stop session handler %s/%d\n", task->comm, task_pid_nr(task)); - conn->status = KSMBD_SESS_EXITING; + ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); t->ops->shutdown(t); - read_lock(&conn_list_lock); + down_read(&conn_list_lock); } } - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); if (!list_empty(&conn_list)) { schedule_timeout_interruptible(HZ / 10); /* 100ms */ diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 0e3a848defaf..ad8dfaa48ffb 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -26,7 +26,8 @@ enum { KSMBD_SESS_GOOD, KSMBD_SESS_EXITING, KSMBD_SESS_NEED_RECONNECT, - KSMBD_SESS_NEED_NEGOTIATE + KSMBD_SESS_NEED_NEGOTIATE, + KSMBD_SESS_RELEASING }; struct ksmbd_stats { @@ -140,10 +141,10 @@ struct ksmbd_transport { #define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr)) extern struct list_head conn_list; -extern rwlock_t conn_list_lock; +extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); @@ -162,6 +163,8 @@ void 
ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); int ksmbd_conn_handler_loop(void *p); int ksmbd_conn_transport_init(void); void ksmbd_conn_transport_destroy(void); +void ksmbd_conn_lock(struct ksmbd_conn *conn); +void ksmbd_conn_unlock(struct ksmbd_conn *conn); /* * WARNING @@ -169,43 +172,60 @@ void ksmbd_conn_transport_destroy(void); * This is a hack. We will move status to a proper place once we land * a multi-sessions support. */ -static inline bool ksmbd_conn_good(struct ksmbd_work *work) +static inline bool ksmbd_conn_good(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_GOOD; + return READ_ONCE(conn->status) == KSMBD_SESS_GOOD; } -static inline bool ksmbd_conn_need_negotiate(struct ksmbd_work *work) +static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_NEED_NEGOTIATE; + return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE; } -static inline bool ksmbd_conn_need_reconnect(struct ksmbd_work *work) +static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_NEED_RECONNECT; + return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT; } -static inline bool ksmbd_conn_exiting(struct ksmbd_work *work) +static inline bool ksmbd_conn_exiting(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_EXITING; + return READ_ONCE(conn->status) == KSMBD_SESS_EXITING; } -static inline void ksmbd_conn_set_good(struct ksmbd_work *work) +static inline bool ksmbd_conn_releasing(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_GOOD; + return READ_ONCE(conn->status) == KSMBD_SESS_RELEASING; } -static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_work *work) +static inline void ksmbd_conn_set_new(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_NEED_NEGOTIATE; + WRITE_ONCE(conn->status, KSMBD_SESS_NEW); } -static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_work *work) 
+static inline void ksmbd_conn_set_good(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_NEED_RECONNECT; + WRITE_ONCE(conn->status, KSMBD_SESS_GOOD); } -static inline void ksmbd_conn_set_exiting(struct ksmbd_work *work) +static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_EXITING; + WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE); } + +static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT); +} + +static inline void ksmbd_conn_set_exiting(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_EXITING); +} + +static inline void ksmbd_conn_set_releasing(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_RELEASING); +} + +void ksmbd_all_conn_set_status(u64 sess_id, u32 status); #endif /* __CONNECTION_H__ */ diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c index f19de20c2960..f07a05f37651 100644 --- a/fs/ksmbd/mgmt/tree_connect.c +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -137,6 +137,9 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) struct ksmbd_tree_connect *tc; unsigned long id; + if (!sess) + return -EINVAL; + xa_for_each(&sess->tree_conns, id, tc) ret |= ksmbd_tree_conn_disconnect(sess, tc); xa_destroy(&sess->tree_conns); diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 92b1603b5abe..ea4b56d570fb 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -30,15 +30,15 @@ struct ksmbd_session_rpc { static void free_channel_list(struct ksmbd_session *sess) { - struct channel *chann, *tmp; + struct channel *chann; + unsigned long index; - write_lock(&sess->chann_lock); - list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, - chann_list) { - list_del(&chann->chann_list); + xa_for_each(&sess->ksmbd_chann_list, index, chann) { + xa_erase(&sess->ksmbd_chann_list, index); kfree(chann); } - 
write_unlock(&sess->chann_lock); + + xa_destroy(&sess->ksmbd_chann_list); } static void __session_rpc_close(struct ksmbd_session *sess, @@ -153,10 +153,6 @@ void ksmbd_session_destroy(struct ksmbd_session *sess) if (!sess) return; - down_write(&sessions_table_lock); - hash_del(&sess->hlist); - up_write(&sessions_table_lock); - if (sess->user) ksmbd_free_user(sess->user); @@ -174,76 +170,101 @@ static struct ksmbd_session *__session_lookup(unsigned long long id) struct ksmbd_session *sess; hash_for_each_possible(sessions_table, sess, hlist, id) { - if (id == sess->id) + if (id == sess->id) { + sess->last_active = jiffies; return sess; + } } return NULL; } +static void ksmbd_expire_session(struct ksmbd_conn *conn) +{ + unsigned long id; + struct ksmbd_session *sess; + + down_write(&sessions_table_lock); + xa_for_each(&conn->sessions, id, sess) { + if (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT)) { + xa_erase(&conn->sessions, sess->id); + hash_del(&sess->hlist); + ksmbd_session_destroy(sess); + continue; + } + } + up_write(&sessions_table_lock); +} + int ksmbd_session_register(struct ksmbd_conn *conn, struct ksmbd_session *sess) { sess->dialect = conn->dialect; memcpy(sess->ClientGUID, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE); + ksmbd_expire_session(conn); return xa_err(xa_store(&conn->sessions, sess->id, sess, GFP_KERNEL)); } static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess) { - struct channel *chann, *tmp; - - write_lock(&sess->chann_lock); - list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, - chann_list) { - if (chann->conn == conn) { - list_del(&chann->chann_list); - kfree(chann); - write_unlock(&sess->chann_lock); - return 0; - } - } - write_unlock(&sess->chann_lock); + struct channel *chann; - return -ENOENT; + chann = xa_erase(&sess->ksmbd_chann_list, (long)conn); + if (!chann) + return -ENOENT; + + kfree(chann); + return 0; } void 
ksmbd_sessions_deregister(struct ksmbd_conn *conn) { struct ksmbd_session *sess; + unsigned long id; + down_write(&sessions_table_lock); if (conn->binding) { int bkt; + struct hlist_node *tmp; - down_write(&sessions_table_lock); - hash_for_each(sessions_table, bkt, sess, hlist) { - if (!ksmbd_chann_del(conn, sess)) { - up_write(&sessions_table_lock); - goto sess_destroy; + hash_for_each_safe(sessions_table, bkt, tmp, sess, hlist) { + if (!ksmbd_chann_del(conn, sess) && + xa_empty(&sess->ksmbd_chann_list)) { + hash_del(&sess->hlist); + ksmbd_session_destroy(sess); } } - up_write(&sessions_table_lock); - } else { - unsigned long id; - - xa_for_each(&conn->sessions, id, sess) { - if (!ksmbd_chann_del(conn, sess)) - goto sess_destroy; - } } - return; + xa_for_each(&conn->sessions, id, sess) { + unsigned long chann_id; + struct channel *chann; + + xa_for_each(&sess->ksmbd_chann_list, chann_id, chann) { + if (chann->conn != conn) + ksmbd_conn_set_exiting(chann->conn); + } -sess_destroy: - if (list_empty(&sess->ksmbd_chann_list)) { - xa_erase(&conn->sessions, sess->id); - ksmbd_session_destroy(sess); + ksmbd_chann_del(conn, sess); + if (xa_empty(&sess->ksmbd_chann_list)) { + xa_erase(&conn->sessions, sess->id); + hash_del(&sess->hlist); + ksmbd_session_destroy(sess); + } } + up_write(&sessions_table_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id) { - return xa_load(&conn->sessions, id); + struct ksmbd_session *sess; + + sess = xa_load(&conn->sessions, id); + if (sess) + sess->last_active = jiffies; + return sess; } struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) @@ -252,6 +273,8 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); + if (sess) + sess->last_active = jiffies; up_read(&sessions_table_lock); return sess; @@ -320,6 +343,9 @@ static struct ksmbd_session *__session_create(int protocol) struct 
ksmbd_session *sess; int ret; + if (protocol != CIFDS_SESSION_FLAG_SMB2) + return NULL; + sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL); if (!sess) return NULL; @@ -327,32 +353,24 @@ static struct ksmbd_session *__session_create(int protocol) if (ksmbd_init_file_table(&sess->file_table)) goto error; + sess->last_active = jiffies; + sess->state = SMB2_SESSION_IN_PROGRESS; set_session_flag(sess, protocol); xa_init(&sess->tree_conns); - INIT_LIST_HEAD(&sess->ksmbd_chann_list); + xa_init(&sess->ksmbd_chann_list); INIT_LIST_HEAD(&sess->rpc_handle_list); sess->sequence_number = 1; - rwlock_init(&sess->chann_lock); - - switch (protocol) { - case CIFDS_SESSION_FLAG_SMB2: - ret = __init_smb2_session(sess); - break; - default: - ret = -EINVAL; - break; - } + ret = __init_smb2_session(sess); if (ret) goto error; ida_init(&sess->tree_conn_ida); - if (protocol == CIFDS_SESSION_FLAG_SMB2) { - down_write(&sessions_table_lock); - hash_add(sessions_table, &sess->hlist, sess->id); - up_write(&sessions_table_lock); - } + down_write(&sessions_table_lock); + hash_add(sessions_table, &sess->hlist, sess->id); + up_write(&sessions_table_lock); + return sess; error: diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 8934b8ee275b..51f38e5b61ab 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -21,7 +21,6 @@ struct ksmbd_file_table; struct channel { __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; struct ksmbd_conn *conn; - struct list_head chann_list; }; struct preauth_session { @@ -50,8 +49,7 @@ struct ksmbd_session { char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; - rwlock_t chann_lock; - struct list_head ksmbd_chann_list; + struct xarray ksmbd_chann_list; struct xarray tree_conns; struct ida tree_conn_ida; struct list_head rpc_handle_list; @@ -61,6 +59,7 @@ struct ksmbd_session { __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; struct ksmbd_file_table file_table; + unsigned long last_active; }; static inline int 
test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index 8c2bc513445c..8a0ad399f245 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -93,7 +93,8 @@ static inline int check_conn_state(struct ksmbd_work *work) { struct smb_hdr *rsp_hdr; - if (ksmbd_conn_exiting(work) || ksmbd_conn_need_reconnect(work)) { + if (ksmbd_conn_exiting(work->conn) || + ksmbd_conn_need_reconnect(work->conn)) { rsp_hdr = work->response_buf; rsp_hdr->Status.CifsError = STATUS_CONNECTION_DISCONNECTED; return 1; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index acd66fb40c5f..8f96b96dbac1 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -74,14 +74,7 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id) struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn) { - struct channel *chann; - - list_for_each_entry(chann, &sess->ksmbd_chann_list, chann_list) { - if (chann->conn == conn) - return chann; - } - - return NULL; + return xa_load(&sess->ksmbd_chann_list, (long)conn); } /** @@ -254,7 +247,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp = smb2_get_msg(work->response_buf); - WARN_ON(ksmbd_conn_good(work)); + WARN_ON(ksmbd_conn_good(conn)); rsp->StructureSize = cpu_to_le16(65); ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect); @@ -284,7 +277,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; conn->use_spnego = true; - ksmbd_conn_set_need_negotiate(work); + ksmbd_conn_set_need_negotiate(conn); return 0; } @@ -574,7 +567,7 @@ int smb2_check_user_session(struct ksmbd_work *work) cmd == SMB2_SESSION_SETUP_HE) return 0; - if (!ksmbd_conn_good(work)) + if (!ksmbd_conn_good(conn)) return -EINVAL; sess_id = le64_to_cpu(req_hdr->SessionId); @@ -592,6 +585,7 @@ static void destroy_previous_session(struct ksmbd_conn *conn, struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id); struct 
ksmbd_user *prev_user; struct channel *chann; + long index; if (!prev_sess) return; @@ -605,10 +599,8 @@ static void destroy_previous_session(struct ksmbd_conn *conn, return; prev_sess->state = SMB2_SESSION_EXPIRED; - write_lock(&prev_sess->chann_lock); - list_for_each_entry(chann, &prev_sess->ksmbd_chann_list, chann_list) - chann->conn->status = KSMBD_SESS_EXITING; - write_unlock(&prev_sess->chann_lock); + xa_for_each(&prev_sess->ksmbd_chann_list, index, chann) + ksmbd_conn_set_exiting(chann->conn); } /** @@ -1075,7 +1067,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_debug(SMB, "Received negotiate request\n"); conn->need_neg = false; - if (ksmbd_conn_good(work)) { + if (ksmbd_conn_good(conn)) { pr_err("conn->tcp_status is already in CifsGood State\n"); work->send_no_response = 1; return rc; @@ -1230,7 +1222,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) } conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode); - ksmbd_conn_set_need_negotiate(work); + ksmbd_conn_set_need_negotiate(conn); err_out: if (rc < 0) @@ -1520,19 +1512,14 @@ static int ntlm_authenticate(struct ksmbd_work *work) binding_session: if (conn->dialect >= SMB30_PROT_ID) { - read_lock(&sess->chann_lock); chann = lookup_chann_list(sess, conn); - read_unlock(&sess->chann_lock); if (!chann) { chann = kmalloc(sizeof(struct channel), GFP_KERNEL); if (!chann) return -ENOMEM; chann->conn = conn; - INIT_LIST_HEAD(&chann->chann_list); - write_lock(&sess->chann_lock); - list_add(&chann->chann_list, &sess->ksmbd_chann_list); - write_unlock(&sess->chann_lock); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); } } @@ -1606,19 +1593,14 @@ static int krb5_authenticate(struct ksmbd_work *work) } if (conn->dialect >= SMB30_PROT_ID) { - read_lock(&sess->chann_lock); chann = lookup_chann_list(sess, conn); - read_unlock(&sess->chann_lock); if (!chann) { chann = kmalloc(sizeof(struct channel), GFP_KERNEL); if (!chann) return -ENOMEM; chann->conn = conn; - 
INIT_LIST_HEAD(&chann->chann_list); - write_lock(&sess->chann_lock); - list_add(&chann->chann_list, &sess->ksmbd_chann_list); - write_unlock(&sess->chann_lock); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); } } @@ -1661,6 +1643,7 @@ int smb2_sess_setup(struct ksmbd_work *work) rsp->SecurityBufferLength = 0; inc_rfc1001_len(work->response_buf, 9); + ksmbd_conn_lock(conn); if (!req->hdr.SessionId) { sess = ksmbd_smb2_session_create(); if (!sess) { @@ -1708,6 +1691,12 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; } + if (ksmbd_conn_need_reconnect(conn)) { + rc = -EFAULT; + sess = NULL; + goto out_err; + } + if (ksmbd_session_lookup(conn, sess_id)) { rc = -EACCES; goto out_err; @@ -1732,12 +1721,20 @@ int smb2_sess_setup(struct ksmbd_work *work) rc = -ENOENT; goto out_err; } + + if (sess->state == SMB2_SESSION_EXPIRED) { + rc = -EFAULT; + goto out_err; + } + + if (ksmbd_conn_need_reconnect(conn)) { + rc = -EFAULT; + sess = NULL; + goto out_err; + } } work->sess = sess; - if (sess->state == SMB2_SESSION_EXPIRED) - sess->state = SMB2_SESSION_IN_PROGRESS; - negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || @@ -1767,8 +1764,10 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; } - ksmbd_conn_set_good(work); - sess->state = SMB2_SESSION_VALID; + if (!ksmbd_conn_need_reconnect(conn)) { + ksmbd_conn_set_good(conn); + sess->state = SMB2_SESSION_VALID; + } kfree(sess->Preauth_HashValue); sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { @@ -1790,8 +1789,10 @@ int smb2_sess_setup(struct ksmbd_work *work) if (rc) goto out_err; - ksmbd_conn_set_good(work); - sess->state = SMB2_SESSION_VALID; + if (!ksmbd_conn_need_reconnect(conn)) { + ksmbd_conn_set_good(conn); + sess->state = SMB2_SESSION_VALID; + } if (conn->binding) { struct preauth_session *preauth_sess; @@ 
-1859,14 +1860,17 @@ out_err: if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION) try_delay = true; - xa_erase(&conn->sessions, sess->id); - ksmbd_session_destroy(sess); - work->sess = NULL; - if (try_delay) + sess->last_active = jiffies; + sess->state = SMB2_SESSION_EXPIRED; + if (try_delay) { + ksmbd_conn_set_need_reconnect(conn); ssleep(5); + ksmbd_conn_set_need_negotiate(conn); + } } } + ksmbd_conn_unlock(conn); return rc; } @@ -2091,21 +2095,25 @@ int smb2_session_logoff(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; struct smb2_logoff_rsp *rsp = smb2_get_msg(work->response_buf); - struct ksmbd_session *sess = work->sess; + struct ksmbd_session *sess; + struct smb2_logoff_req *req = smb2_get_msg(work->request_buf); + u64 sess_id = le64_to_cpu(req->hdr.SessionId); rsp->StructureSize = cpu_to_le16(4); inc_rfc1001_len(work->response_buf, 4); ksmbd_debug(SMB, "request\n"); - /* setting CifsExiting here may race with start_tcp_sess */ - ksmbd_conn_set_need_reconnect(work); + ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn); + ksmbd_conn_wait_idle(conn, sess_id); + /* + * Re-lookup session to validate if session is deleted + * while waiting request complete + */ + sess = ksmbd_session_lookup_all(conn, sess_id); if (ksmbd_tree_conn_session_logoff(sess)) { - struct smb2_logoff_req *req = smb2_get_msg(work->request_buf); - ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); @@ -2117,9 +2125,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_free_user(sess->user); sess->user = NULL; - - /* let start_tcp_sess free connection info now */ - ksmbd_conn_set_need_negotiate(work); + ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); return 0; } @@ -6947,7 +6953,7 @@ int smb2_lock(struct ksmbd_work *work) nolock = 1; /* check locks in connection list */ - 
read_lock(&conn_list_lock); + down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { spin_lock(&conn->llist_lock); list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) { @@ -6964,7 +6970,7 @@ int smb2_lock(struct ksmbd_work *work) list_del(&cmp_lock->flist); list_del(&cmp_lock->clist); spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); locks_free_lock(cmp_lock->fl); kfree(cmp_lock); @@ -6986,7 +6992,7 @@ int smb2_lock(struct ksmbd_work *work) cmp_lock->start > smb_lock->start && cmp_lock->start < smb_lock->end) { spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); pr_err("previous lock conflict with zero byte lock range\n"); goto out; } @@ -6995,7 +7001,7 @@ int smb2_lock(struct ksmbd_work *work) smb_lock->start > cmp_lock->start && smb_lock->start < cmp_lock->end) { spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); pr_err("current lock conflict with zero byte lock range\n"); goto out; } @@ -7006,14 +7012,14 @@ int smb2_lock(struct ksmbd_work *work) cmp_lock->end >= smb_lock->end)) && !cmp_lock->zero_len && !smb_lock->zero_len) { spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); pr_err("Not allow lock operation on exclusive lock range\n"); goto out; } } spin_unlock(&conn->llist_lock); } - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); out_check_cl: if (smb_lock->fl->fl_type == F_UNLCK && nolock) { pr_err("Try to unlock nolocked range\n"); @@ -8428,14 +8434,11 @@ int smb3_check_sign_req(struct ksmbd_work *work) if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { signing_key = work->sess->smb3signingkey; } else { - read_lock(&work->sess->chann_lock); chann = lookup_chann_list(work->sess, conn); if (!chann) { - read_unlock(&work->sess->chann_lock); return 0; } signing_key = chann->smb3signingkey; - read_unlock(&work->sess->chann_lock); } if 
(!signing_key) { @@ -8495,14 +8498,11 @@ void smb3_set_sign_rsp(struct ksmbd_work *work) le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { signing_key = work->sess->smb3signingkey; } else { - read_lock(&work->sess->chann_lock); chann = lookup_chann_list(work->sess, work->conn); if (!chann) { - read_unlock(&work->sess->chann_lock); return; } signing_key = chann->smb3signingkey; - read_unlock(&work->sess->chann_lock); } if (!signing_key) diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index f4baa9800f6e..dd10f8031606 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -61,6 +61,8 @@ struct preauth_integrity_info { #define SMB2_SESSION_IN_PROGRESS BIT(0) #define SMB2_SESSION_VALID BIT(1) +#define SMB2_SESSION_TIMEOUT (10 * HZ) + struct create_durable_req_v2 { struct create_context ccontext; __u8 Name[8]; diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 20e85e2701f2..eff7a1d793f0 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -333,7 +333,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, if (length == -EINTR) { total_read = -ESHUTDOWN; break; - } else if (conn->status == KSMBD_SESS_NEED_RECONNECT) { + } else if (ksmbd_conn_need_reconnect(conn)) { total_read = -EAGAIN; break; } else if (length == -ERESTARTSYS || length == -EAGAIN) { diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 49cfe2ae6d23..993375f0db67 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -65,7 +65,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct fsnotify_event *fsn_event; struct fsnotify_group *group = inode_mark->group; int ret; - int len = 0; + int len = 0, wd; int alloc_len = sizeof(struct inotify_event_info); struct mem_cgroup *old_memcg; @@ -81,6 +81,13 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, fsn_mark); /* + * We can be racing with mark 
being detached. Don't report event with + * invalid wd. + */ + wd = READ_ONCE(i_mark->wd); + if (wd == -1) + return 0; + /* * Whoever is interested in the event, pays for the allocation. Do not * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. @@ -110,7 +117,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, fsn_event = &event->fse; fsnotify_init_event(fsn_event); event->mask = mask; - event->wd = i_mark->wd; + event->wd = wd; event->sync_cookie = cookie; event->name_len = len; if (len) diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 45f95c1cb258..e0cdc91d88a8 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -661,7 +661,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) if (!wnd->bits_last) wnd->bits_last = wbits; - wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + wnd->free_bits = + kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index bc22cc321a74..a9549e73081f 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -86,6 +86,16 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, __putname(uni); } + /* + * Check for a null pointer + * If the MFT record of ntfs inode is not a base record, inode->i_op can be NULL. + * This causes null pointer dereference in d_splice_alias(). + */ + if (!IS_ERR_OR_NULL(inode) && !inode->i_op) { + iput(inode); + inode = ERR_PTR(-EINVAL); + } + return d_splice_alias(inode, dentry); } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 9cc396b117bf..0f38d558169a 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -436,9 +436,6 @@ static inline u64 attr_svcn(const struct ATTRIB *attr) return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0; } -/* The size of resident attribute by its resident size. 
*/ -#define BYTES_PER_RESIDENT(b) (0x18 + (b)) - static_assert(sizeof(struct ATTRIB) == 0x48); static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08); static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 48f2d60bd78a..72f2b373221e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1281,7 +1281,10 @@ out: * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure + * @table: the top-level table structure without any child. This table + * should not be free'd after registration. So it should not be + * used on stack. It can either be a global or dynamically allocated + * by the caller and free'd later after sysctl unregistration. * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1302,9 +1305,12 @@ out: * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines + * XXX: we should eventually modify these to use long min / max [0] + * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. + * under /proc; non-leaf nodes (where child is not NULL) are not allowed, + * sysctl_check_table() verifies this. * * There must be a proc_handler routine for any terminal nodes. 
* Several default handlers are available to cover common cases - @@ -1346,7 +1352,7 @@ struct ctl_table_header *__register_sysctl_table( spin_lock(&sysctl_lock); dir = &set->dir; - /* Reference moved down the diretory tree get_subdir */ + /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); @@ -1363,6 +1369,11 @@ struct ctl_table_header *__register_sysctl_table( if (namelen == 0) continue; + /* + * namelen ensures if name is "foo/bar/yay" only foo is + * registered first. We traverse as if using mkdir -p and + * return a ctl_dir for the last directory entry. + */ dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) goto fail; @@ -1388,8 +1399,15 @@ fail: /** * register_sysctl - register a sysctl table - * @path: The path to the directory the sysctl table is in. - * @table: the table structure + * @path: The path to the directory the sysctl table is in. If the path + * doesn't exist we will create it for you. + * @table: the table structure. The calller must ensure the life of the @table + * will be kept during the lifetime use of the syctl. It must not be freed + * until unregister_sysctl_table() is called with the given returned table + * with this registration. If your code is non modular then you don't need + * to call unregister_sysctl_table() and can instead use something like + * register_sysctl_init() which does not care for the result of the syctl + * registration. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1405,8 +1423,11 @@ EXPORT_SYMBOL(register_sysctl); /** * __register_sysctl_init() - register sysctl table to path - * @path: path name for sysctl base - * @table: This is the sysctl table that needs to be registered to the path + * @path: path name for sysctl base. If that path doesn't exist we will create + * it for you. + * @table: This is the sysctl table that needs to be registered to the path. 
+ * The caller must ensure the life of the @table will be kept during the + * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @@ -1418,10 +1439,7 @@ EXPORT_SYMBOL(register_sysctl); * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * - * Context: Can only be called after your respective sysctl base path has been - * registered. So for instance, most base directories are registered early on - * init before init levels are processed through proc_sys_init() and - * sysctl_init_bases(). + * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) @@ -1551,6 +1569,7 @@ out: * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. + * We are slowly deprecating this call so avoid its use. * * See __register_sysctl_table for more details. */ @@ -1622,6 +1641,7 @@ err_register_leaves: * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. + * We are slowly deprecating this caller so avoid future uses of it. * * See __register_sysctl_paths for more details. */ |