diff options
Diffstat (limited to 'fs')
453 files changed, 12287 insertions, 9816 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 83eab52fb3f6..b0e42b6a96b9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,7 @@ config FS_DAX depends on MMU depends on !(ARM || MIPS || SPARC) select FS_IOMAP + select DAX help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 2f8bab390d13..773749be8290 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -7,7 +7,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/amigaffs.h> +#include "amigaffs.h" #include <linux/mutex.h> #include <linux/workqueue.h> @@ -173,7 +173,7 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); extern int affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); -extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, +extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h new file mode 100644 index 000000000000..43b41c06aa37 --- /dev/null +++ b/fs/affs/amigaffs.h @@ -0,0 +1,144 @@ +#ifndef AMIGAFFS_H +#define AMIGAFFS_H + +#include <linux/types.h> +#include <asm/byteorder.h> + +#define FS_OFS 0x444F5300 +#define FS_FFS 0x444F5301 +#define FS_INTLOFS 0x444F5302 +#define FS_INTLFFS 0x444F5303 +#define FS_DCOFS 0x444F5304 +#define FS_DCFFS 0x444F5305 +#define MUFS_FS 0x6d754653 /* 'muFS' */ +#define MUFS_OFS 0x6d754600 /* 'muF\0' */ +#define MUFS_FFS 0x6d754601 /* 'muF\1' */ +#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */ +#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */ +#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */ +#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */ + +#define T_SHORT 2 +#define T_LIST 16 +#define T_DATA 8 + +#define ST_LINKFILE -4 +#define ST_FILE -3 +#define ST_ROOT 1 +#define ST_USERDIR 2 +#define ST_SOFTLINK 3 +#define ST_LINKDIR 4 + +#define AFFS_ROOT_BMAPS 25 + +struct affs_date { + __be32 days; + __be32 mins; + __be32 ticks; +}; + +struct affs_short_date { + __be16 days; + __be16 mins; + __be16 ticks; +}; + +struct affs_root_head { + __be32 ptype; + __be32 spare1; + __be32 spare2; + __be32 hash_size; + __be32 spare3; + __be32 checksum; + __be32 hashtable[1]; +}; + +struct affs_root_tail { + __be32 bm_flag; + __be32 bm_blk[AFFS_ROOT_BMAPS]; + __be32 bm_ext; + struct affs_date root_change; + u8 disk_name[32]; + __be32 spare1; + __be32 spare2; + struct affs_date disk_change; + struct affs_date disk_create; + __be32 spare3; + __be32 spare4; + __be32 dcache; + __be32 stype; +}; + +struct affs_head { + __be32 ptype; + __be32 key; + __be32 block_count; + __be32 spare1; + __be32 first_data; + __be32 checksum; + __be32 table[1]; +}; + +struct affs_tail { + __be32 spare1; + __be16 uid; + __be16 gid; + __be32 protect; + __be32 size; + u8 comment[92]; + struct affs_date change; + u8 name[32]; + __be32 spare2; + __be32 original; + __be32 link_chain; + __be32 spare[5]; + __be32 hash_chain; + __be32 parent; + __be32 extension; + __be32 stype; +}; + +struct slink_front +{ + __be32 ptype; + __be32 key; + __be32 spare1[3]; + __be32 checksum; + u8 symname[1]; /* depends on block size */ +}; + +struct affs_data_head +{ + __be32 ptype; + __be32 key; + __be32 sequence; + __be32 size; + __be32 next; + __be32 checksum; + u8 data[1]; /* depends on block size */ +}; + +/* Permission bits */ + +#define FIBF_OTR_READ 0x8000 +#define FIBF_OTR_WRITE 0x4000 +#define FIBF_OTR_EXECUTE 0x2000 +#define FIBF_OTR_DELETE 0x1000 +#define FIBF_GRP_READ 0x0800 +#define FIBF_GRP_WRITE 0x0400 +#define FIBF_GRP_EXECUTE 0x0200 +#define FIBF_GRP_DELETE 0x0100 + +#define FIBF_HIDDEN 0x0080 +#define FIBF_SCRIPT 0x0040 +#define FIBF_PURE 0x0020 /* no use under linux */ +#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */ +#define FIBF_NOREAD 0x0008 /* 0 means allowed */ +#define FIBF_NOWRITE 0x0004 /* 0 means allowed */ +#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */ +#define FIBF_NODELETE 0x0001 /* 0 means allowed */ + +#define FIBF_OWNER 0x000F /* Bits pertaining to owner */ +#define FIBF_MASK 0xEE0E /* Bits modified by Linux */ + +#endif diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1e7294381c5..591ecd7f3063 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -35,7 +35,7 @@ const struct inode_operations affs_dir_inode_operations = { .symlink = affs_symlink, .mkdir = affs_mkdir, .rmdir = affs_rmdir, - .rename = affs_rename, + .rename = affs_rename2, .setattr = affs_notify_change, }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 0deec9cc2362..196ee7f6fdc4 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -499,7 +499,7 @@ affs_getemptyblk_ino(struct inode *inode, int block) } static int -affs_do_readpage_ofs(struct page *page, unsigned to) +affs_do_readpage_ofs(struct page *page, unsigned to, int create) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; @@ -518,7 +518,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to) boff = tmp % bsize; while (pos < to) { - bh = affs_bread_ino(inode, bidx, 0); + bh = affs_bread_ino(inode, bidx, create); if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); @@ -620,7 +620,7 @@ affs_readpage_ofs(struct file *file, struct page *page) memset(page_address(page) + to, 0, PAGE_SIZE - to); } - err = affs_do_readpage_ofs(page, to); + err = affs_do_readpage_ofs(page, to, 0); if (!err) SetPageUptodate(page); unlock_page(page); @@ -657,7 +657,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE); + err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); if (err) { unlock_page(page); put_page(page); @@ -679,7 +679,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, int written; from = pos & (PAGE_SIZE - 1); - to = pos + len; + to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but * we don't have to, because the page should always be uptodate here, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index abcc59899229..fd4ef3c40e40 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &affs_file_operations; break; case ST_SOFTLINK: + inode->i_size = strlen((char *)AFFS_HEAD(bh)->table); inode->i_mode |= S_IFLNK; inode_nohighmem(inode); inode->i_op = &affs_symlink_inode_operations; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 96dd1d09a273..46d3ace6761d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -365,6 +365,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) symname++; } *p = 0; + inode->i_size = i + 1; mark_buffer_dirty_inode(bh, inode); affs_brelse(bh); mark_inode_dirty(inode); @@ -393,21 +394,14 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return affs_add_entry(dir, inode, dentry, ST_LINKFILE); } -int +static int affs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) + struct inode *new_dir, struct dentry *new_dentry) { struct super_block *sb = old_dir->i_sb; struct buffer_head *bh = NULL; int retval; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, - old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); - retval = affs_check_name(new_dentry->d_name.name, new_dentry->d_name.len, affs_nofilenametruncate(old_dentry)); @@ -447,6 +441,76 @@ done: return retval; } +static int +affs_xrename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + + struct super_block *sb = old_dir->i_sb; + struct buffer_head *bh_old = NULL; + struct buffer_head *bh_new = NULL; + int retval; + + bh_old = affs_bread(sb, d_inode(old_dentry)->i_ino); + if (!bh_old) + return -EIO; + + bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino); + if (!bh_new) + return -EIO; + + /* Remove old header from its parent directory. */ + affs_lock_dir(old_dir); + retval = affs_remove_hash(old_dir, bh_old); + affs_unlock_dir(old_dir); + if (retval) + goto done; + + /* Remove new header from its parent directory. */ + affs_lock_dir(new_dir); + retval = affs_remove_hash(new_dir, bh_new); + affs_unlock_dir(new_dir); + if (retval) + goto done; + + /* Insert old into the new directory with the new name. */ + affs_copy_name(AFFS_TAIL(sb, bh_old)->name, new_dentry); + affs_fix_checksum(sb, bh_old); + affs_lock_dir(new_dir); + retval = affs_insert_hash(new_dir, bh_old); + affs_unlock_dir(new_dir); + + /* Insert new into the old directory with the old name. */ + affs_copy_name(AFFS_TAIL(sb, bh_new)->name, old_dentry); + affs_fix_checksum(sb, bh_new); + affs_lock_dir(old_dir); + retval = affs_insert_hash(old_dir, bh_new); + affs_unlock_dir(old_dir); +done: + mark_buffer_dirty_inode(bh_old, new_dir); + mark_buffer_dirty_inode(bh_new, old_dir); + affs_brelse(bh_old); + affs_brelse(bh_new); + return retval; +} + +int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); + + if (flags & RENAME_EXCHANGE) + return affs_xrename(old_dir, old_dentry, new_dir, new_dentry); + + return affs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct dentry *affs_get_parent(struct dentry *child) { struct inode *parent; @@ -477,11 +541,6 @@ static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; } diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 4f64b95d57bd..095c54165dfd 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -27,6 +27,7 @@ kafs-objs := \ vlocation.o \ vnode.o \ volume.o \ - write.o + write.o \ + xattr.o obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 3062cceb5c2a..782d4d05a53b 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -350,7 +350,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; - struct uuid_v1 *r; + struct afs_uuid *r; unsigned loop; __be32 *b; int ret; @@ -380,7 +380,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -453,7 +453,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) static void SRXAFSCB_ProbeUuid(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); - struct uuid_v1 *r = call->request; + struct afs_uuid *r = call->request; struct { __be32 match; @@ -476,7 +476,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) */ static int afs_deliver_cb_probe_uuid(struct afs_call *call) { - struct uuid_v1 *r; + struct afs_uuid *r; unsigned loop; __be32 *b; int ret; @@ -502,15 +502,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); if (!call->request) return -ENOMEM; b = call->buffer; r = call->request; - r->time_low = b[0]; - r->time_mid = htons(ntohl(b[1])); - r->time_hi_and_version = htons(ntohl(b[2])); + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 949f960337f5..613a77058263 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -61,6 +61,7 @@ const struct inode_operations afs_dir_inode_operations = { .permission = afs_permission, .getattr = afs_getattr, .setattr = afs_setattr, + .listxattr = afs_listxattr, }; const struct dentry_operations afs_fs_dentry_operations = { diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d5b8508869b..510cba15fa56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -46,6 +46,7 @@ const struct inode_operations afs_file_inode_operations = { .getattr = afs_getattr, .setattr = afs_setattr, .permission = afs_permission, + .listxattr = afs_listxattr, }; const struct address_space_operations afs_fs_aops = { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index aae55dd15108..342316a9e3e0 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -28,6 +28,11 @@ struct afs_iget_data { struct afs_volume *volume; /* volume on which resides */ }; +static const struct inode_operations afs_symlink_inode_operations = { + .get_link = page_get_link, + .listxattr = afs_listxattr, +}; + /* * map the AFS file status to the inode member variables */ @@ -67,7 +72,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_fop = &afs_mntpt_file_operations; } else { inode->i_mode = S_IFLNK | vnode->status.mode; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &afs_symlink_inode_operations; } inode_nohighmem(inode); break; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 393672997cc2..82e16556afea 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -410,6 +410,15 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; +struct afs_uuid { + __be32 time_low; /* low part of timestamp */ + __be16 time_mid; /* mid part of timestamp */ + __be16 time_hi_and_version; /* high part of timestamp and version */ + __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ + __u8 clock_seq_low; /* clock seq low */ + __u8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + /*****************************************************************************/ /* * cache.c @@ -544,7 +553,7 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct uuid_v1 afs_uuid; +extern struct afs_uuid afs_uuid; /* * misc.c @@ -722,6 +731,11 @@ extern int afs_writeback_all(struct afs_vnode *); extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); +/* + * xattr.c + */ +extern const struct xattr_handler *afs_xattr_handlers[]; +extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /*****************************************************************************/ /* diff --git a/fs/afs/main.c b/fs/afs/main.c index 51d7d17bca57..9944770849da 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -31,7 +31,7 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct uuid_v1 afs_uuid; +struct afs_uuid afs_uuid; struct workqueue_struct *afs_wq; /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index bd3b65cde282..690fea9d84c3 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -35,6 +35,7 @@ const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, .readlink = page_readlink, .getattr = afs_getattr, + .listxattr = afs_listxattr, }; const struct inode_operations afs_autocell_inode_operations = { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5990eb160bd..02781e78ffb6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -341,6 +341,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; size_t offset; + s64 tx_total_len; u32 abort_code; int ret; @@ -364,9 +365,20 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, srx.transport.sin.sin_port = call->port; memcpy(&srx.transport.sin.sin_addr, addr, 4); + /* Work out the length we're going to transmit. This is awkward for + * calls such as FS.StoreData where there's an extra injection of data + * after the initial fixed part. + */ + tx_total_len = call->request_size; + if (call->send_pages) { + tx_total_len += call->last_to - call->first_offset; + tx_total_len += (call->last - call->first) * PAGE_SIZE; + } + /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp, + (unsigned long)call, + tx_total_len, gfp, (async ? afs_wake_up_async_call : afs_wake_up_call_waiter)); @@ -738,6 +750,8 @@ void afs_send_empty_reply(struct afs_call *call) _enter(""); + rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0); + msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); @@ -772,6 +786,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) _enter(""); + rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len); + iov[0].iov_base = (void *) buf; iov[0].iov_len = len; msg.msg_name = NULL; diff --git a/fs/afs/security.c b/fs/afs/security.c index ecb86a670180..faca66227ecf 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -327,12 +327,11 @@ int afs_permission(struct inode *inode, int mask) if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; } else if (mask & MAY_READ) { - if (!(access & AFS_ACE_READ)) + if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; } else if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ - AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */ - AFS_ACE_WRITE))) /* chmod */ + AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; } else { BUG(); diff --git a/fs/afs/super.c b/fs/afs/super.c index c79633e5cfd8..67680c2d96cf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -319,6 +319,7 @@ static int afs_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; + sb->s_xattr = afs_xattr_handlers; ret = super_setup_bdi(sb); if (ret) return ret; diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c new file mode 100644 index 000000000000..2830e4f48d85 --- /dev/null +++ b/fs/afs/xattr.c @@ -0,0 +1,121 @@ +/* Extended attribute handling for AFS. We use xattrs to get and set metadata + * instead of providing pioctl(). + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/xattr.h> +#include "internal.h" + +static const char afs_xattr_list[] = + "afs.cell\0" + "afs.fid\0" + "afs.volume"; + +/* + * Retrieve a list of the supported xattrs. + */ +ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + if (size == 0) + return sizeof(afs_xattr_list); + if (size < sizeof(afs_xattr_list)) + return -ERANGE; + memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list)); + return sizeof(afs_xattr_list); +} + +/* + * Get the name of the cell on which a file resides. + */ +static int afs_xattr_get_cell(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell = vnode->volume->cell; + size_t namelen; + + namelen = strlen(cell->name); + if (size == 0) + return namelen; + if (namelen > size) + return -ERANGE; + memcpy(buffer, cell->name, size); + return namelen; +} + +static const struct xattr_handler afs_xattr_afs_cell_handler = { + .name = "afs.cell", + .get = afs_xattr_get_cell, +}; + +/* + * Get the volume ID, vnode ID and vnode uniquifier of a file as a sequence of + * hex numbers separated by colons. + */ +static int afs_xattr_get_fid(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + char text[8 + 1 + 8 + 1 + 8 + 1]; + size_t len; + + len = sprintf(text, "%x:%x:%x", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + if (size == 0) + return len; + if (len > size) + return -ERANGE; + memcpy(buffer, text, len); + return len; +} + +static const struct xattr_handler afs_xattr_afs_fid_handler = { + .name = "afs.fid", + .get = afs_xattr_get_fid, +}; + +/* + * Get the name of the volume on which a file resides. + */ +static int afs_xattr_get_volume(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + const char *volname = vnode->volume->vlocation->vldb.name; + size_t namelen; + + namelen = strlen(volname); + if (size == 0) + return namelen; + if (namelen > size) + return -ERANGE; + memcpy(buffer, volname, size); + return namelen; +} + +static const struct xattr_handler afs_xattr_afs_volume_handler = { + .name = "afs.volume", + .get = afs_xattr_get_volume, +}; + +const struct xattr_handler *afs_xattr_handlers[] = { + &afs_xattr_afs_cell_handler, + &afs_xattr_afs_fid_handler, + &afs_xattr_afs_volume_handler, + NULL +}; @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_pos = iocb->aio_offset; req->common.ki_complete = aio_complete; req->common.ki_flags = iocb_flags(req->common.ki_filp); + req->common.ki_hint = file_write_hint(file); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 734cbf8d9676..dd9f1bebb5a3 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, int status; token = (autofs_wqt_t) param->fail.token; - status = param->fail.status ? param->fail.status : -ENOENT; + status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f2deec0a62f0..25e312cb6071 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,7 +1,7 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. * * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005. @@ -19,7 +19,7 @@ #include <linux/uaccess.h> #include "bfs.h" -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>"); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bee1a36bc2ec..f4718098ac31 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -818,7 +818,7 @@ static const struct super_operations s_ops = { static int bm_fill_super(struct super_block *sb, void *data, int silent) { int err; - static struct tree_descr bm_files[] = { + static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} diff --git a/fs/block_dev.c b/fs/block_dev.c index 2a305c1a2d88..9941dc8342df 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_init(&bio, vecs, nr_pages); bio.bi_bdev = bdev; bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -262,8 +263,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (vecs != inline_vecs) kfree(vecs); - if (unlikely(bio.bi_error)) - return bio.bi_error; + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); + + bio_uninit(&bio); + return ret; } @@ -288,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio) bool should_dirty = dio->should_dirty; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_error && !dio->bio.bi_error) - dio->bio.bi_error = bio->bi_error; + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; } else { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; - ssize_t ret = dio->bio.bi_error; + ssize_t ret; - if (likely(!ret)) { + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); @@ -334,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -358,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) for (;;) { bio->bi_bdev = bdev; bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { - bio->bi_error = ret; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } @@ -412,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); - ret = dio->bio.bi_error; + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; @@ -436,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); if (!blkdev_dio_pool) return -ENOMEM; return 0; @@ -624,7 +632,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct block_device *bdev = I_BDEV(bd_inode); int error; - error = filemap_write_and_wait_range(filp->f_mapping, start, end); + error = file_write_and_wait_range(filp, start, end); if (error) return error; @@ -717,72 +725,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, - pgoff_t *pgoff) -{ - phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - - if (pgoff) - *pgoff = PHYS_PFN(phys_off); - if (phys_off % PAGE_SIZE || size % PAGE_SIZE) - return -EINVAL; - return 0; -} -EXPORT_SYMBOL(bdev_dax_pgoff); - -/** - * bdev_dax_supported() - Check if the device supports dax for filesystem - * @sb: The superblock of the device - * @blocksize: The block size of the device - * - * This is a library function for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: negative errno if unsupported, 0 if supported. - */ -int bdev_dax_supported(struct super_block *sb, int blocksize) -{ - struct block_device *bdev = sb->s_bdev; - struct dax_device *dax_dev; - pgoff_t pgoff; - int err, id; - void *kaddr; - pfn_t pfn; - long len; - - if (blocksize != PAGE_SIZE) { - vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); - return -EINVAL; - } - - err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); - if (err) { - vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); - return err; - } - - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) { - vfs_msg(sb, KERN_ERR, "error: device does not support dax"); - return -EOPNOTSUPP; - } - - id = dax_read_lock(); - len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); - dax_read_unlock(id); - - put_dax(dax_dev); - - if (len < 1) { - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%ld)", len); - return len < 0 ? len : -EIO; - } - - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - /* * pseudo-fs */ @@ -1809,6 +1751,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) return -ENOMEM; filp->f_mapping = bdev->bd_inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return blkdev_get(bdev, filp->f_mode, filp); } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 247b8dfaf6e5..8d8370ddb6b2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (ret) - return ret; - } - ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -119,6 +113,13 @@ out: int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + int ret; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) + return ret; + } return __btrfs_set_acl(NULL, inode, acl, type); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7699e16784d3..f723c11bb763 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,7 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" @@ -26,6 +26,11 @@ #include "delayed-ref.h" #include "locking.h" +enum merge_mode { + MERGE_IDENTICAL_KEYS = 1, + MERGE_IDENTICAL_PARENTS, +}; + /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * slot==nritems. In that case, go to the next leaf before we continue. */ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == (u64)-1) + else if (time_seq == SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); else @@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, /* * merge backrefs and adjust counts accordingly * - * mode = 1: merge identical keys, if key is set - * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. - * additionally, we could even add a key range for the blocks we - * looked into to merge even more (-> replace unresolved refs by those - * having a parent). - * mode = 2: merge identical parents + * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref + * then we can merge more here. Additionally, we could even add a key + * range for the blocks we looked into to merge even more (-> replace + * unresolved refs by those having a parent). */ -static void __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, enum merge_mode mode) { struct __prelim_ref *pos1; @@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode) if (!ref_for_same_block(ref1, ref2)) continue; - if (mode == 1) { + if (mode == MERGE_IDENTICAL_KEYS) { if (!ref1->parent && ref2->parent) swap(ref1, ref2); } else { @@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * - * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave * much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). @@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) path->skip_locking = 1; /* @@ -1273,9 +1276,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != (u64)-1) { + time_seq != SEQ_LAST) { #else - if (trans && time_seq != (u64)-1) { + if (trans && time_seq != SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1286,7 +1289,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1374,7 +1377,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 1); + __merge_refs(&prefs, MERGE_IDENTICAL_KEYS); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, extent_item_pos, total_refs, @@ -1382,7 +1385,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 2); + __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); @@ -2302,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = vmalloc(alloc_bytes); + data = kvmalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); @@ -2336,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, if (IS_ERR(fspath)) return (void *)fspath; - ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); if (!ifp) { - vfree(fspath); + kvfree(fspath); return ERR_PTR(-ENOMEM); } @@ -2353,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - vfree(ipath->fspath); + kvfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0c6baaba0651..d87ac27a5f2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -125,6 +125,13 @@ struct btrfs_inode { u64 delalloc_bytes; /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes. + */ + u64 new_delalloc_bytes; + + /* * total number of bytes pending defrag, used by stat to check whether * it needs COW. */ @@ -303,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e635ca..11d37c94ce05 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,7 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/string.h> #include "ctree.h" #include "disk-io.h" @@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); - if (!bio) { - pr_info("btrfsic: bio_alloc() for %u pages failed!\n", - num_pages - i); - return -1; - } + bio = btrfs_io_bio_alloc(num_pages - i); bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, dev_bytenr += (j - i) * PAGE_SIZE; i = j; } - for (i = 0; i < num_pages; i++) { + for (i = 0; i < num_pages; i++) block_ctx->datav[i] = kmap(block_ctx->pagev[i]); - if (!block_ctx->datav[i]) { - pr_info("btrfsic: kmap() failed (dev %s)!\n", - block_ctx->dev->name); - return -1; - } - } return block_ctx->len; } @@ -2129,7 +2118,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2132,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i; + unsigned int i = 0; u64 dev_bytenr; u64 cur_bytenr; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; int bio_is_patched; char **mapped_datav; + unsigned int segs = bio_segments(bio); dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc_array(bio->bi_vcnt, + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - bio_for_each_segment_all(bvec, bio, i) { - BUG_ON(bvec->bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec.bv_page); + i++; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec->bv_len, bvec->bv_offset); - cur_bytenr += bvec->bv_len; + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, bio->bi_vcnt, + mapped_datav, segs, bio, &bio_is_patched, NULL, bio->bi_opf); - bio_for_each_segment_all(bvec, bio, i) - kunmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) + kunmap(bvec.bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & @@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, fs_info->sectorsize, PAGE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { - state = vzalloc(sizeof(*state)); - if (!state) { - pr_info("btrfs check-integrity: vzalloc() failed!\n"); - return -1; - } + pr_info("btrfs check-integrity: allocation failed!\n"); + return -1; } if (!btrfsic_is_initialized) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c7721a6aa3bb..2c0b7b57fcd5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -42,48 +43,7 @@ #include "extent_io.h" #include "extent_map.h" -struct compressed_bio { - /* number of bios pending for this compressed extent */ - atomic_t pending_bios; - - /* the pages with the compressed data on them */ - struct page **compressed_pages; - - /* inode that owns this data */ - struct inode *inode; - - /* starting offset in the inode for our pages */ - u64 start; - - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; - - /* the compression algorithm for this bio */ - int compress_type; - - /* number of compressed pages in the array */ - unsigned long nr_pages; - - /* IO errors */ - int errors; - int mirror_num; - - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u32 sums; -}; - -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen); +static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) @@ -94,12 +54,6 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } -static struct bio *compressed_bio_alloc(struct block_device *bdev, - u64 first_byte, gfp_t gfp_flags) -{ - return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); -} - static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) @@ -155,13 +109,13 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; inode = cb->inode; @@ -173,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_decompress_bio(cb->compress_type, - cb->compressed_pages, - cb->start, - cb->orig_bio, - cb->compressed_len); + ret = btrfs_decompress_bio(cb); + csum_failed: if (ret) cb->errors = 1; @@ -268,13 +219,13 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; /* ok, we're the last bio for this extent, step one is to @@ -287,7 +238,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -320,7 +271,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -335,14 +286,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; - atomic_set(&cb->pending_bios, 0); + return BLK_STS_RESOURCE; + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->start = start; @@ -355,30 +306,26 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if (!bio) { - kfree(cb); - return -ENOMEM; - } + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -388,7 +335,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -400,14 +347,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } bio_put(bio); - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - BUG_ON(!bio); + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -434,7 +380,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -569,7 +515,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -586,7 +532,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -600,14 +546,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) goto out; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -638,7 +584,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, __GFP_HIGHMEM); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; - ret = -ENOMEM; + ret = BLK_STS_RESOURCE; goto fail2; } } @@ -650,28 +596,26 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); - if (!comp_bio) - goto fail2; + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -685,7 +629,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, @@ -697,15 +641,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } bio_put(comp_bio); - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, - GFP_NOFS); - BUG_ON(!comp_bio); + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -726,7 +668,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -801,6 +743,7 @@ static struct list_head *find_workspace(int type) struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; + unsigned nofs_flag; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; @@ -830,7 +773,15 @@ again: atomic_inc(total_ws); spin_unlock(ws_lock); + /* + * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have + * to turn it off here because we might get called from the restricted + * context of btrfs_compress_bio/btrfs_compress_pages + */ + nofs_flag = memalloc_nofs_save(); workspace = btrfs_compress_op[idx]->alloc_workspace(); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(workspace)) { atomic_dec(total_ws); wake_up(ws_wait); @@ -961,19 +912,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. */ -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen) +static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; int ret; + int type = cb->compress_type; workspace = find_workspace(type); - - ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in, - disk_start, orig_bio, - srclen); + ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); free_workspace(type, workspace); + return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 39ec43ab8df1..87f6d3332163 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,45 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +struct compressed_bio { + /* number of bios pending for this compressed extent */ + refcount_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + void btrfs_init_compress(void); void btrfs_exit_compress(void); @@ -48,12 +87,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { @@ -78,10 +117,7 @@ struct btrfs_compress_op { unsigned long *total_out); int (*decompress_bio)(struct list_head *workspace, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen); + struct compressed_bio *cb); int (*decompress)(struct list_head *workspace, unsigned char *data_in, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dc8844037e0..3f4daa9d6e2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,7 +19,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items, gfp_t flags) + int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags, + struct extent_buffer *new_root, int log_removal) { struct tree_mod_elem *tm = NULL; @@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - flags); + GFP_NOFS); if (!tm_list) { ret = -ENOMEM; goto free_tms; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, } } - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, { int ret; ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items, GFP_NOFS); + nr_items); BUG_ON(ret < 0); } @@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root, { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS, log_removal); + new_root_node, log_removal); BUG_ON(ret < 0); } @@ -3667,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, /* make room in the right data area */ data_end = leaf_data_end(fs_info, right); memmove_extent_buffer(right, - btrfs_leaf_data(right) + data_end - push_space, - btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end - push_space, + BTRFS_LEAF_DATA_OFFSET + data_end, BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, btrfs_leaf_data(right) + + copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(left) + leaf_data_end(fs_info, left), + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3888,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset_nr(right, push_items - 1); - copy_extent_buffer(left, right, btrfs_leaf_data(left) + + copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); @@ -3917,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - leaf_data_end(fs_info, right); - memmove_extent_buffer(right, btrfs_leaf_data(right) + + memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), @@ -4069,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, btrfs_leaf_data(l) + + BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - + data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); @@ -4607,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; @@ -4634,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, } } - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); @@ -4707,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - data_size, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - data_size, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; @@ -4790,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - total_data, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; } @@ -4983,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { int data_end = leaf_data_end(fs_info, leaf); - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, - btrfs_leaf_data(leaf) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); for (i = slot + nr; i < nritems; i++) { @@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!tmp_buf) { - tmp_buf = vmalloc(fs_info->nodesize); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } left_path->search_commit_root = 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3e21211e99c3..3f3eb7b17cac 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -39,6 +39,7 @@ #include <linux/security.h> #include <linux/sizes.h> #include <linux/dynamic_debug.h> +#include <linux/refcount.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -47,7 +48,6 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; @@ -518,7 +518,7 @@ struct btrfs_caching_control { struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; - atomic_t count; + refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ @@ -538,6 +538,14 @@ struct btrfs_io_ctl { unsigned check_crcs:1; }; +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -648,6 +656,9 @@ struct btrfs_block_group_cache { * Protected by free_space_lock. */ int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; /* delayed seq elem */ @@ -658,6 +669,8 @@ struct seq_list { #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define SEQ_LAST ((u64)-1) + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -702,6 +715,15 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +#define BTRFS_FS_QUOTA_OVERRIDE 14 +/* Used to record internally whether fs has been frozen */ +#define BTRFS_FS_FROZEN 15 + +/* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ +#define BTRFS_FS_EXCL_OP 14 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -729,8 +751,7 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; + atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -778,17 +799,7 @@ struct btrfs_fs_info { * so it is also safe. */ u64 max_inline; - /* - * Protected by ->chunk_mutex and sb->s_umount. - * - * The reason that we use two lock to protect it is because only - * remount and mount operations can change it and these two operations - * are under sb->s_umount, but the read side (chunk allocation) can not - * acquire sb->s_umount or the deadlock would happen. So we use two - * locks to protect it. On the write side, we must acquire two locks, - * and on the read side, we just need acquire one of them. - */ - u64 alloc_start; + struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; @@ -1066,8 +1077,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - atomic_t mutually_exclusive_operation_running; - struct percpu_counter bio_counter; wait_queue_head_t replace_wait; @@ -1090,9 +1099,6 @@ struct btrfs_fs_info { */ struct list_head pinned_chunks; - /* Used to record internally whether fs has been frozen */ - int fs_frozen; - /* Cached block sizes */ u32 nodesize; u32 sectorsize; @@ -1220,7 +1226,7 @@ struct btrfs_root { dev_t anon_dev; spinlock_t root_item_lock; - atomic_t refs; + refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; @@ -1260,21 +1266,20 @@ struct btrfs_root { /* For qgroup metadata space reserve */ atomic64_t qgroup_meta_rsv; }; + static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { return btrfs_sb(inode->i_sb)->sectorsize; } -static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) -{ - return blocksize - sizeof(struct btrfs_header); -} - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return __BTRFS_LEAF_DATA_SIZE(info->nodesize); + + return info->nodesize - sizeof(struct btrfs_header); } +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1536,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ s->member = cpu_to_le##bits(val); \ } + +static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + + BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); @@ -2307,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } -static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) -{ - return offsetof(struct btrfs_leaf, items); -} /* * The leaf data grows from end-to-front in the node. @@ -2521,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(btrfs_leaf_data(leaf) + \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(btrfs_leaf_data(leaf) + \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) @@ -2546,7 +2566,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* @@ -2556,7 +2576,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, @@ -2663,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2686,9 +2708,13 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2705,8 +2731,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); @@ -3014,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod); int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, + struct extent_buffer *leaf, int slot, struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3061,8 +3089,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3077,7 +3105,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); @@ -3154,6 +3182,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3646,6 +3675,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); +static inline void btrfs_init_full_stripe_locks_tree( + struct btrfs_full_stripe_locks_tree *locks_root) +{ + locks_root->root = RB_ROOT; + mutex_init(&locks_root->lock); +} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); @@ -3670,8 +3705,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err); +int btree_readahead_hook(struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1aff676f0e5b..8ae409b5a61d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node( { delayed_node->root = root; delayed_node->inode_id = inode_id; - atomic_set(&delayed_node->refs, 0); + refcount_set(&delayed_node->refs, 0); delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); + refcount_inc(&node->refs); return node; } @@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { - atomic_inc(&node->refs); /* can be accessed */ + refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); return node; } btrfs_inode->delayed_node = node; /* can be accessed and cached in the inode */ - atomic_add(2, &node->refs); + refcount_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -125,7 +125,7 @@ again: btrfs_init_delayed_node(node, root, ino); /* cached in the btrfs inode and can be accessed */ - atomic_add(2, &node->refs); + refcount_set(&node->refs, 2); ret = radix_tree_preload(GFP_NOFS); if (ret) { @@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, } else { list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); - atomic_inc(&node->refs); /* inserted into list */ + refcount_inc(&node->refs); /* inserted into list */ root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; - atomic_dec(&node->refs); /* not in the list */ + refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); @@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node( p = delayed_root->node_list.next; node = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( p = node->n_list.next; next = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); out: spin_unlock(&delayed_root->lock); @@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); - if (atomic_dec_and_test(&delayed_node->refs)) { + if (refcount_dec_and_test(&delayed_node->refs)) { bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); - if (atomic_read(&delayed_node->refs) == 0) { + if (refcount_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); free = true; @@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( p = delayed_root->prepare_list.next; list_del_init(p); node = list_entry(p, struct btrfs_delayed_node, p_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->ins_or_del = 0; item->bytes_reserved = 0; item->delayed_node = NULL; - atomic_set(&item->refs, 1); + refcount_set(&item->refs, 1); } return item; } @@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) { if (item) { __btrfs_remove_delayed_item(item); - if (atomic_dec_and_test(&item->refs)) + if (refcount_dec_and_test(&item->refs)) kfree(item); } } @@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); } @@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ - atomic_dec(&delayed_node->refs); + refcount_dec(&delayed_node->refs); return true; } @@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } list_for_each_entry_safe(curr, next, del_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } @@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, list_del(&curr->readdir_list); ret = (curr->key.offset == index); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (ret) @@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_del(&curr->readdir_list); if (curr->key.offset < ctx->pos) { - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } @@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, over = !dir_emit(ctx, name, name_len, location.objectid, d_type); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) @@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) inode_id = delayed_nodes[n - 1]->inode_id + 1; for (i = 0; i < n; i++) - atomic_inc(&delayed_nodes[i]->refs); + refcount_inc(&delayed_nodes[i]->refs); spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 40327cc3b99a..c4189d495934 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -26,7 +26,7 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/atomic.h> - +#include <linux/refcount.h> #include "ctree.h" /* types of the delayed item */ @@ -67,7 +67,7 @@ struct btrfs_delayed_node { struct rb_root del_root; struct mutex mutex; struct btrfs_inode_item inode_item; - atomic_t refs; + refcount_t refs; u64 index_cnt; unsigned long flags; int count; @@ -80,7 +80,7 @@ struct btrfs_delayed_item { struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; - atomic_t refs; + refcount_t refs; int ins_or_del; u32 data_len; char data[0]; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6eb80952efb3..93ffa898df6d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -470,7 +470,8 @@ add_tail: static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) + struct btrfs_delayed_ref_node *update, + int *old_ref_mod_ret) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; @@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing_ref->total_ref_mod; + if (old_ref_mod_ret) + *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing_ref->total_ref_mod += update->ref_mod; @@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret) + int action, int is_data, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -590,7 +594,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = count_mod; @@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref, + old_ref_mod); /* * we've updated the existing ref, free the newly * allocated ref @@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (old_ref_mod) + *old_ref_mod = 0; if (is_data && count_mod < 0) delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; @@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; + if (new_ref_mod) + *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -682,7 +691,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -739,7 +748,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, seq = atomic64_read(&fs_info->tree_mod_seq); /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted); + &qrecord_inserted, old_ref_mod, + new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action) + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted); + action, 1, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, @@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL); + extent_op->is_data, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 0e537f98f1a1..ce88e4ac5276 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,6 +18,8 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ +#include <linux/refcount.h> + /* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ @@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node { u64 seq; /* ref count on this data structure */ - atomic_t refs; + refcount_t refs; /* * how many refs is this entry adding or deleting. For @@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(refcount_read(&ref->refs) == 0); + if (refcount_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: @@ -245,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action); + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e653921f05d9..bee3edeea7a3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return scrub_ret; @@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div_u64(dev_replace->cursor_left, + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } @@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_unlock(dev_replace, 1); - WARN_ON(atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data) (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 60a750678a82..41cb9196eaa8 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, dir_item)) - return NULL; total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { @@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); + if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) + return NULL; if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, + int slot, struct btrfs_dir_item *dir_item) { u16 namelen = BTRFS_NAME_LEN; + int ret; u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { @@ -468,10 +470,16 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, if (btrfs_dir_name_len(leaf, dir_item) > namelen) { btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_data_len(leaf, dir_item)); + (unsigned)btrfs_dir_name_len(leaf, dir_item)); return 1; } + namelen = btrfs_dir_name_len(leaf, dir_item); + ret = btrfs_is_name_len_valid(leaf, slot, + (unsigned long)(dir_item + 1), namelen); + if (!ret) + return 1; + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > @@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, return 0; } + +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_key key; + u32 read_start; + u32 read_end; + u32 item_start; + u32 item_end; + u32 size; + bool ret = true; + + ASSERT(start > BTRFS_LEAF_DATA_OFFSET); + + read_start = start - BTRFS_LEAF_DATA_OFFSET; + read_end = read_start + name_len; + item_start = btrfs_item_offset_nr(leaf, slot); + item_end = btrfs_item_end_nr(leaf, slot); + + btrfs_item_key_to_cpu(leaf, &key, slot); + + switch (key.type) { + case BTRFS_DIR_ITEM_KEY: + case BTRFS_XATTR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + size = sizeof(struct btrfs_dir_item); + break; + case BTRFS_INODE_REF_KEY: + size = sizeof(struct btrfs_inode_ref); + break; + case BTRFS_INODE_EXTREF_KEY: + size = sizeof(struct btrfs_inode_extref); + break; + case BTRFS_ROOT_REF_KEY: + case BTRFS_ROOT_BACKREF_KEY: + size = sizeof(struct btrfs_root_ref); + break; + default: + ret = false; + goto out; + } + + if (read_start < item_start) { + ret = false; + goto out; + } + if (read_end > item_end) { + ret = false; + goto out; + } + + /* there shall be item(s) before name */ + if (read_start - item_start < size) { + ret = false; + goto out; + } + +out: + if (!ret) + btrfs_crit(fs_info, "invalid dir item name len: %u", + (unsigned int)name_len); + return ret; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 061c1d1f774f..086dcbadce09 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,9 +87,8 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; - struct list_head list; struct btrfs_work work; }; @@ -118,9 +117,9 @@ void btrfs_end_io_wq_exit(void) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + void *private_data; + struct btrfs_fs_info *fs_info; struct bio *bio; - struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; int mirror_num; @@ -131,7 +130,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -762,7 +761,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, ret); + btree_readahead_hook(eb, ret); if (ret) { /* @@ -787,7 +786,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, -EIO); + btree_readahead_hook(eb, -EIO); return -EIO; /* we fixed nothing */ } @@ -799,7 +798,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -836,19 +835,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; - end_io_wq->error = 0; + end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -868,14 +867,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, + ret = async->submit_bio_start(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -885,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work) int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; + fs_info = async->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -898,13 +897,13 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } - async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->submit_bio_done(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); } @@ -916,20 +915,20 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; - async->inode = inode; + async->private_data = private_data; + async->fs_info = fs_info; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -986,11 +985,12 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -998,7 +998,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1015,13 +1015,14 @@ static int check_async_write(unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(bio_flags); - int ret; + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1043,8 +1044,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0, - bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, + bio_offset, private_data, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -1054,7 +1055,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1222,10 +1223,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf) buf->start + buf->len - 1); } -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -1255,9 +1256,9 @@ void clean_tree_block(struct btrfs_fs_info *fs_info, btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -1340,15 +1341,14 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - atomic_set(&root->refs, 1); + refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&root->dirty_log_pages, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1820,7 +1820,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -2309,7 +2309,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) inode->i_mapping->a_ops = &btree_aops; RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); BTRFS_I(inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); @@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); @@ -2662,12 +2661,11 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->free_chunk_space = 0; + atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ @@ -2704,10 +2702,8 @@ int open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping); - extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); @@ -3467,10 +3463,12 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); - else + if (i == 0) { + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_FUA, bh); + } else { ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + } if (ret) errors++; } @@ -3483,64 +3481,61 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio) { - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); + complete(bio->bi_private); } /* - * trigger flushes for one the devices. If you pass wait == 0, the flushes are - * sent down. With wait == 1, it waits for the previous flush. - * - * any device where the flush fails with eopnotsupp are flagged as not-barrier - * capable + * Submit a flush request to the device if it supports it. Error handling is + * done in the waiting counterpart. */ -static int write_dev_flush(struct btrfs_device *device, int wait) +static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio; - int ret = 0; + struct request_queue *q = bdev_get_queue(device->bdev); + struct bio *bio = device->flush_bio; - if (device->nobarriers) - return 0; + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return; - if (wait) { - bio = device->flush_bio; - if (!bio) - return 0; + bio_reset(bio); + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; - wait_for_completion(&device->flush_wait); + submit_bio(bio); + device->flush_bio_sent = 1; +} - if (bio->bi_error) { - ret = bio->bi_error; - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); - } +/* + * If the flush bio has been submitted by write_dev_flush, wait for it. + */ +static blk_status_t wait_dev_flush(struct btrfs_device *device) +{ + struct bio *bio = device->flush_bio; - /* drop the reference from the wait == 0 run */ - bio_put(bio); - device->flush_bio = NULL; + if (!device->flush_bio_sent) + return 0; - return ret; - } + device->flush_bio_sent = 0; + wait_for_completion_io(&device->flush_wait); - /* - * one reference for us, and we leave it for the - * caller - */ - device->flush_bio = NULL; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - return -ENOMEM; + return bio->bi_status; +} - bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - init_completion(&device->flush_wait); - bio->bi_private = &device->flush_wait; - device->flush_bio = bio; +static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +{ + int dev_flush_error = 0; + struct btrfs_device *dev; + + list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { + if (!dev->bdev || dev->last_flush_error) + dev_flush_error++; + } - bio_get(bio); - btrfsic_submit_bio(bio); + if (dev_flush_error > + fsdevs->fs_info->num_tolerated_disk_barrier_failures) + return -EIO; return 0; } @@ -3553,25 +3548,21 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (dev->missing) continue; - if (!dev->bdev) { - errors_send++; + if (!dev->bdev) continue; - } if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 0); - if (ret) - errors_send++; + write_dev_flush(dev); + dev->last_flush_error = 0; } /* wait for all the barriers */ @@ -3585,13 +3576,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 1); - if (ret) + ret = wait_dev_flush(dev); + if (ret) { + dev->last_flush_error = ret; + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); errors_wait++; + } + } + + if (errors_wait) { + /* + * At some point we need the status of all disks + * to arrive at the volume status. So error checking + * is being pushed to a separate loop. + */ + return check_barrier_error(info->fs_devices); } - if (errors_send > info->num_tolerated_disk_barrier_failures || - errors_wait > info->num_tolerated_disk_barrier_failures) - return -EIO; return 0; } @@ -4046,9 +4047,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->start, transid, fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + buf->len, + fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { btrfs_print_leaf(fs_info, buf); @@ -4321,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -4575,11 +4576,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - - /* - memset(cur_trans, 0, sizeof(*cur_trans)); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - */ } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) @@ -4593,7 +4589,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_START) { - atomic_inc(&t->use_count); + refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); @@ -4635,6 +4631,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } +static struct btrfs_fs_info *btree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, @@ -4642,6 +4644,8 @@ static const struct extent_io_ops btree_extent_io_ops = { /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btree_io_failed_hook, + .set_range_writeback = btrfs_set_range_writeback, + .tree_fs_info = btree_fs_info, /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2e0ec29bfd69..0a634d3ffc16 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); */ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) { - if (atomic_inc_not_zero(&root->refs)) + if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } static inline void btrfs_put_fs_root(struct btrfs_root *root) { - if (atomic_dec_and_test(&root->refs)) + if (refcount_dec_and_test(&root->refs)) kfree(root); } @@ -118,16 +118,16 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 87144c9f9593..fa66980726c9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -282,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } + ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); + if (!ret) { + btrfs_free_path(path); + return -EIO; + } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index be5477676cc8..375f8c728d91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush); + enum btrfs_reserve_flush_enum flush, + bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -131,6 +132,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. + */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache); } @@ -316,14 +327,14 @@ get_caching_control(struct btrfs_block_group_cache *cache) } ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); spin_unlock(&cache->lock); return ctl; } static void put_caching_control(struct btrfs_caching_control *ctl) { - if (atomic_dec_and_test(&ctl->count)) + if (refcount_dec_and_test(&ctl->count)) kfree(ctl); } @@ -599,7 +610,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; - atomic_set(&caching_ctl->count, 1); + refcount_set(&caching_ctl->count, 1); btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, caching_thread, NULL, NULL); @@ -620,7 +631,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_caching_control *ctl; ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&cache->lock); @@ -707,7 +718,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } down_write(&fs_info->commit_root_sem); - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->commit_root_sem); @@ -756,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. @@ -892,7 +923,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -2082,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && @@ -2089,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 0, - BTRFS_ADD_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_ADD_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + return ret; } @@ -2401,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, head = btrfs_delayed_node_to_head(node); trace_run_delayed_ref_head(fs_info, node, head, node->action); + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, node->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -node->num_bytes); + btrfs_put_block_group(cache); + } + if (insert_reserved) { btrfs_pin_extent(fs_info, node->bytenr, node->num_bytes, 1); @@ -2980,7 +3028,7 @@ again: struct btrfs_delayed_ref_node *ref; ref = &head->node; - atomic_inc(&ref->refs); + refcount_inc(&ref->refs); spin_unlock(&delayed_refs->lock); /* @@ -3003,7 +3051,6 @@ again: goto again; } out: - assert_qgroups_uptodate(trans); trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -3057,7 +3104,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3355,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3443,7 +3491,8 @@ again: /* * don't bother trying to write stuff out _if_ * a) we're not cached, - * b) we're with nospace_cache mount option. + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3473,7 +3522,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3504,6 +3553,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -3914,87 +3964,83 @@ static const char *alloc_name(u64 flags) }; } -static int update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) +static int create_space_info(struct btrfs_fs_info *info, u64 flags, + struct btrfs_space_info **new) { - struct btrfs_space_info *found; + + struct btrfs_space_info *space_info; int i; - int factor; int ret; - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - - found = __find_space_info(info, flags); - if (found) { - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; - return 0; - } - found = kzalloc(sizeof(*found), GFP_NOFS); - if (!found) + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); if (ret) { - kfree(found); + kfree(space_info); return ret; } for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&found->block_groups[i]); - init_rwsem(&found->groups_sem); - spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - found->total_bytes = total_bytes; - found->disk_total = total_bytes * factor; - found->bytes_used = bytes_used; - found->disk_used = bytes_used * factor; - found->bytes_pinned = 0; - found->bytes_reserved = 0; - found->bytes_readonly = bytes_readonly; - found->bytes_may_use = 0; - found->full = 0; - found->max_extent_size = 0; - found->force_alloc = CHUNK_ALLOC_NO_FORCE; - found->chunk_alloc = 0; - found->flush = 0; - init_waitqueue_head(&found->wait); - INIT_LIST_HEAD(&found->ro_bgs); - INIT_LIST_HEAD(&found->tickets); - INIT_LIST_HEAD(&found->priority_tickets); - - ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, info->space_info_kobj, "%s", - alloc_name(found->flags)); + alloc_name(space_info->flags)); if (ret) { - kfree(found); + percpu_counter_destroy(&space_info->total_bytes_pinned); + kfree(space_info); return ret; } - *space_info = found; - list_add_rcu(&found->list, &info->space_info); + *new = space_info; + list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = found; + info->data_sinfo = space_info; return ret; } +static void update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4110,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; @@ -4127,6 +4173,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { @@ -4176,7 +4237,7 @@ again: data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); + alloc_target = btrfs_data_alloc_profile(fs_info); /* * It is ugly that we don't call nolock join * transaction for the free space inode case here. @@ -4227,7 +4288,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1, 0, + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4267,12 +4328,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4287,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -4330,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4340,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -4452,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } if (left < thresh) { - u64 flags; + u64 flags = btrfs_system_alloc_profile(fs_info); - flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); /* * Ignore failure to create system chunk. We might end up not * needing it, as we might not need to COW all nodes/leafs from @@ -4495,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(fs_info, flags); if (!space_info) { - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); - BUG_ON(ret); /* -ENOMEM */ + ret = create_space_info(fs_info, flags, &space_info); + if (ret) + return ret; } - BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -4603,11 +4662,11 @@ out: return ret; } -static int can_overcommit(struct btrfs_root *root, +static int can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; u64 space_size; @@ -4618,7 +4677,11 @@ static int can_overcommit(struct btrfs_root *root, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - profile = btrfs_get_alloc_profile(root, 0); + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + used = btrfs_space_info_used(space_info, false); /* @@ -4635,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free @@ -4687,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, } } -static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; - int nr; + u64 nr; bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = (int)div64_u64(to_reclaim, bytes); + nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; return nr; @@ -4705,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 max_reclaim; + u64 items; long time_left; unsigned long nr_pages; int loops; - int items; enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &fs_info->delalloc_block_rsv; @@ -4765,7 +4825,7 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, flush)) { + if (can_overcommit(fs_info, space_info, orig, flush, false)) { spin_unlock(&space_info->lock); break; } @@ -4827,14 +4887,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) >= 0) { + bytes - delayed_rsv->size) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); commit: - trans = btrfs_join_transaction(fs_info->fs_root); + trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return -ENOSPC; @@ -4852,7 +4912,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, u64 orig_bytes, int state) { - struct btrfs_root *root = fs_info->fs_root; + struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -4875,7 +4935,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4885,7 +4945,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; } ret = do_chunk_alloc(trans, fs_info, - btrfs_get_alloc_profile(root, 0), + btrfs_metadata_alloc_profile(fs_info), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) @@ -4906,8 +4966,9 @@ static int flush_space(struct btrfs_fs_info *fs_info, } static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, - struct btrfs_space_info *space_info) +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) { struct reserve_ticket *ticket; u64 used; @@ -4922,14 +4983,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4943,17 +5004,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_root *root, u64 used) +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -4990,8 +5052,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -5013,8 +5075,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_tickets_id == space_info->tickets_id) { @@ -5052,8 +5115,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state = FLUSH_DELAYED_ITEMS_NR; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -5132,12 +5195,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -5159,7 +5222,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (can_overcommit(root, space_info, orig_bytes, flush)) { + } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); @@ -5186,7 +5250,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, orig_bytes, flush, "enospc"); queue_work(system_unbound_wq, - &root->fs_info->async_reclaim_work); + &fs_info->async_reclaim_work); } } else { list_add_tail(&ticket.list, @@ -5200,7 +5264,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(space_info, root, used) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -5258,9 +5323,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; + bool system_chunk = (root == fs_info->chunk_root); - ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, - flush); + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -5369,9 +5435,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, * overcommit, and if we can't then we just need to free up our space * and not satisfy any requests. */ - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); if (used - num_bytes >= space_info->total_bytes) check_overcommit = true; again: @@ -5383,8 +5447,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !can_overcommit(fs_info->extent_root, space_info, 0, - flush)) + !can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); @@ -6113,6 +6176,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * @@ -6130,16 +6195,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6158,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -6237,6 +6304,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + num_bytes); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6313,6 +6382,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes); set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -6783,27 +6853,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, - u64 owner, u64 root_objectid) -{ - struct btrfs_space_info *space_info; - u64 flags; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } - - space_info = __find_space_info(fs_info, flags); - BUG_ON(!space_info); /* Logic bug */ - percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); -} - - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -7026,8 +7075,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } - add_pinned_bytes(info, -num_bytes, owner_objectid, - root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -7159,19 +7206,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - buf->start, buf->len, - parent, + int old_ref_mod, new_ref_mod; + + ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, + buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ + pin = old_ref_mod >= 0 && new_ref_mod < 0; } - if (!last_ref) - return; - - if (btrfs_header_generation(buf) == trans->transid) { + if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group_cache *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -7180,6 +7227,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -7195,18 +7243,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); - pin = 0; } out: if (pin) add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), root->root_key.objectid); - /* - * Deleting the buffer, clear the corrupt flag since it doesn't matter - * anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } } /* Can return -ENOMEM */ @@ -7215,12 +7264,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -7230,19 +7279,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + old_ref_mod = new_ref_mod = 0; ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, 0, - BTRFS_DROP_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_DROP_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + return ret; } @@ -7945,7 +8000,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 flags; int ret; - flags = btrfs_get_alloc_profile(root, is_data); + flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, @@ -8189,9 +8244,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - ram_bytes, BTRFS_ADD_DELAYED_EXTENT); + ins->offset, 0, root_objectid, owner, + offset, ram_bytes, + BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; } @@ -8411,11 +8466,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - ins.objectid, ins.offset, - parent, root_objectid, level, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, + ins.offset, parent, + root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, NULL, NULL); if (ret) goto out_free_delayed; } @@ -9917,6 +9972,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); return cache; } @@ -10047,19 +10103,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - ret = update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&info->block_group_cache_lock); - btrfs_put_block_group(cache); - goto error; - } + update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10191,16 +10237,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } #endif /* - * Call to ensure the corresponding space_info object is created and - * assigned to our block group, but don't update its counters just yet. - * We want our bg to be added to the rbtree with its ->space_info set. + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. */ - ret = update_space_info(fs_info, cache->flags, 0, 0, 0, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; + cache->space_info = __find_space_info(fs_info, cache->flags); + if (!cache->space_info) { + ret = create_space_info(fs_info, cache->flags, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } } ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10215,18 +10264,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - ret = update_space_info(fs_info, cache->flags, size, bytes_used, + update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - return ret; - } update_global_block_rsv(fs_info); __link_block_group(cache->space_info, cache); @@ -10416,7 +10455,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->caching_block_groups, list) if (ctl->block_group == block_group) { caching_ctl = ctl; - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); break; } } @@ -10774,21 +10813,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } out: return ret; @@ -10850,7 +10889,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ret = find_free_dev_extent_start(trans, device, minlen, start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 27fdb250b446..556484cf5d93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void) pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), - atomic_read(&state->refs)); + refcount_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode; - u64 isize; - - if (!tree->mapping) - return; - - inode = tree->mapping->host; - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } + if (tree->ops && tree->ops->check_extent_io_range) + tree->ops->check_extent_io_range(tree->private_data, caller, + start, end); } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { - if (!tree->mapping) - return NULL; - return btrfs_sb(tree->mapping->host->i_sb); + if (tree->ops) + return tree->ops->tree_fs_info(tree->private_data); + return NULL; } int __init extent_io_init(void) @@ -174,7 +164,8 @@ int __init extent_io_init(void) goto free_state_cache; btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio)); + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS); if (!btrfs_bioset) goto free_buffer_cache; @@ -213,13 +204,13 @@ void extent_io_exit(void) } void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping) + void *private_data) { tree->state = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - tree->mapping = mapping; + tree->private_data = private_data; } static struct extent_state *alloc_extent_state(gfp_t mask) @@ -238,7 +229,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); - atomic_set(&state->refs, 1); + refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); return state; @@ -248,7 +239,7 @@ void free_extent_state(struct extent_state *state) { if (!state) return; - if (atomic_dec_and_test(&state->refs)) { + if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); @@ -369,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->mapping->host, new, - other); + tree->ops->merge_extent_hook(tree->private_data, new, other); } /* @@ -421,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->mapping->host, state, bits); + tree->ops->set_bit_hook(tree->private_data, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), - state, bits); + tree->ops->clear_bit_hook(tree->private_data, state, bits); } static void set_state_bits(struct extent_io_tree *tree, @@ -478,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->mapping->host, orig, split); + tree->ops->split_extent_hook(tree->private_data, orig, split); } /* @@ -641,7 +630,7 @@ again: if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) { if (clear) - atomic_dec(&cached->refs); + refcount_dec(&cached->refs); state = cached; goto hit_next; } @@ -793,7 +782,7 @@ process_node: if (state->state & bits) { start = state->start; - atomic_inc(&state->refs); + refcount_inc(&state->refs); wait_on_state(tree, state); free_extent_state(state); goto again; @@ -834,7 +823,7 @@ static void cache_state_if_flags(struct extent_state *state, if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { *cached_ptr = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } } } @@ -1402,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) */ static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); - put_page(page); - index++; - } + tree->ops->set_range_writeback(tree->private_data, start, end); } /* find the first state struct with 'bits' set after 'start', and @@ -1538,7 +1517,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, if (!found) { *start = state->start; *cached_state = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } found++; *end = state->end; @@ -1961,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec) { int ret; int err = 0; - struct extent_io_tree *failure_tree = &inode->io_failure_tree; set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, @@ -1974,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) if (ret) err = ret; - ret = clear_extent_bits(&inode->io_tree, rec->start, + ret = clear_extent_bits(io_tree, rec->start, rec->start + rec->len - 1, EXTENT_DAMAGED); if (ret && !err) @@ -1994,29 +1974,21 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, - u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - /* we can't repair anything in raid56 yet */ - if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) - return 0; - - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; @@ -2026,17 +1998,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bbio, 0); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + ASSERT(bbio->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); } - BUG_ON(mirror_num != bbio->mirror_num); - sector = bbio->stripes[mirror_num-1].physical >> 9; + + sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[mirror_num-1].dev; + dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { btrfs_bio_counter_dec(fs_info); @@ -2057,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - btrfs_ino(inode), start, + ino, start, rcu_str_deref(dev->name), sector); btrfs_bio_counter_dec(fs_info); bio_put(bio); @@ -2077,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; - ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, - PAGE_SIZE, start, p, + ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) break; @@ -2092,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, - unsigned int pg_offset) +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset) { u64 private; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *state; int num_copies; int ret; private = 0; - ret = count_range_bits(&inode->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0); + ret = count_range_bits(failure_tree, &private, (u64)-1, 1, + EXTENT_DIRTY, 0); if (!ret) return 0; - ret = get_state_failrec(&inode->io_failure_tree, start, - &failrec); + ret = get_state_failrec(failure_tree, start, &failrec); if (ret) return 0; @@ -2125,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, if (fs_info->sb->s_flags & MS_RDONLY) goto out; - spin_lock(&inode->io_tree.lock); - state = find_first_extent_bit_state(&inode->io_tree, + spin_lock(&io_tree->lock); + state = find_first_extent_bit_state(io_tree, failrec->start, EXTENT_LOCKED); - spin_unlock(&inode->io_tree.lock); + spin_unlock(&io_tree->lock); if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { - repair_io_failure(inode, start, failrec->len, - failrec->logical, page, - pg_offset, failrec->failed_mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, + failrec->failed_mirror); } } out: - free_io_failure(inode, failrec); + free_io_failure(failure_tree, io_tree, failrec); return 0; } @@ -2343,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct btrfs_io_bio *btrfs_failed_bio; struct btrfs_io_bio *btrfs_bio; - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return NULL; - + bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = fs_info->fs_devices->latest_bdev; @@ -2384,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct io_failure_record *failrec; struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2396,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2409,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, (int)phy_offset, failed_bio->bi_end_io, NULL); if (!bio) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, tree, failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -2418,11 +2406,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { - free_io_failure(BTRFS_I(inode), failrec); + if (status) { + free_io_failure(failure_tree, tree, failrec); bio_put(bio); + ret = blk_status_to_errno(status); } return ret; @@ -2445,7 +2434,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); - ret = ret < 0 ? ret : -EIO; + ret = err < 0 ? err : -EIO; mapping_set_error(page->mapping, ret); } } @@ -2461,6 +2450,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; @@ -2490,7 +2480,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2523,9 +2513,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *tree; + struct extent_io_tree *tree, *failure_tree; u64 offset = 0; u64 start; u64 end; @@ -2543,9 +2533,10 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -2575,8 +2566,10 @@ static void end_bio_extent_readpage(struct bio *bio) if (ret) uptodate = 0; else - clean_io_failure(BTRFS_I(inode), start, - page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, + page, + btrfs_ino(BTRFS_I(inode)), 0); } if (likely(uptodate)) @@ -2602,7 +2595,7 @@ static void end_bio_extent_readpage(struct bio *bio) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = !bio->bi_error; + uptodate = !bio->bi_status; offset += len; continue; } @@ -2660,77 +2653,80 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } /* - * this allocates from the btrfs_bioset. We're returning a bio right now - * but you can call btrfs_io_bio for the appropriate container_of magic + * Initialize the members up to but not including 'bio'. Use after allocating a + * new bio by bio_alloc_bioset as it does not initialize the bytes outside of + * 'bio' because use of __GFP_ZERO is not supported. */ -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) { - struct btrfs_io_bio *btrfs_bio; - struct bio *bio; - - bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); + memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); +} - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) { - bio = bio_alloc_bioset(gfp_flags, - nr_vecs, btrfs_bioset); - } - } +/* + * The following helpers allocate a bio. As it's backed by a bioset, it'll + * never fail. We're returning a bio right now but you can call btrfs_io_bio + * for the appropriate container_of magic + */ +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +{ + struct bio *bio; - if (bio) { - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = first_sector; - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_byte >> 9; + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *btrfs_bio_clone(struct bio *bio) { struct btrfs_io_bio *btrfs_bio; struct bio *new; - new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); - if (new) { - btrfs_bio = btrfs_io_bio(new); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + btrfs_bio = btrfs_io_bio(new); + btrfs_io_bio_init(btrfs_bio); + btrfs_bio->iter = bio->bi_iter; return new; } -/* this also allocates from the btrfs_bioset */ -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) { - struct btrfs_io_bio *btrfs_bio; struct bio *bio; - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); - if (bio) { - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_bio; + + /* this will never fail when it's backed by a bioset */ + bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + ASSERT(bio); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_io_bio_init(btrfs_bio); + + bio_trim(bio, offset >> 9, size >> 9); + btrfs_bio->iter = bio->bi_iter; + return bio; +} static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2742,13 +2738,13 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops) - ret = tree->ops->submit_bio_hook(page->mapping->host, bio, + ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, bio_flags, start); else btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -2805,14 +2801,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, - GFP_NOFS | __GFP_HIGH); - if (!bio) - return -ENOMEM; - + bio = btrfs_bio_alloc(bdev, sector << 9); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio->bi_write_hint = page->mapping->host->i_write_hint; bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); @@ -2859,7 +2852,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } @@ -2870,7 +2863,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); - atomic_inc(&em->refs); + refcount_inc(&em->refs); *em_cached = em; } return em; @@ -3584,9 +3577,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - -eb->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); ret = 1; } else { spin_unlock(&eb->refs_lock); @@ -3694,7 +3687,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); @@ -3744,7 +3737,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); memzero_extent_buffer(eb, start, end - start); } @@ -4364,6 +4357,119 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +/* + * To cache previous fiemap extent + * + * Will be used for merging fiemap extent + */ +struct fiemap_cache { + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + int ret = 0; + + if (!cache->cached) + goto assign; + + /* + * Sanity check, extent_fiemap() should have ensured that new + * fiemap extent won't overlap with cahced one. + * Not recoverable. + * + * NOTE: Physical address can overlap, due to compression + */ + if (cache->offset + cache->len > offset) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags except FIEMAP_EXTENT_LAST + * So regular extent won't get merged with prealloc extent + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + (cache->flags & ~FIEMAP_EXTENT_LAST) == + (flags & ~FIEMAP_EXTENT_LAST)) { + cache->len += len; + cache->flags |= flags; + goto try_submit_last; + } + + /* Not mergeable, need to submit cached one */ + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret) + return ret; +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; +try_submit_last: + if (cache->flags & FIEMAP_EXTENT_LAST) { + ret = fiemap_fill_next_extent(fieinfo, cache->offset, + cache->phys, cache->len, cache->flags); + cache->cached = false; + } + return ret; +} + +/* + * Emit last fiemap cache + * + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. + * So we must emit it before ending extent_fiemap(). + */ +static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4381,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; + struct fiemap_cache cache = { 0 }; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4560,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); + ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, + em_len, flags); if (ret) { if (ret == 1) ret = 0; @@ -4569,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } } out_free: + if (!ret) + ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3e4fad4a909d..3fb8513bf02e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include <linux/refcount.h> #include "ulist.h" /* bits for the extent state */ @@ -14,14 +15,17 @@ #define EXTENT_DEFRAG (1U << 6) #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) -#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_CLEAR_META_RESV (1U << 11) #define EXTENT_FIRST_DELALLOC (1U << 12) #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) +#define EXTENT_DELALLOC_NEW (1U << 18) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* @@ -88,7 +92,7 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { @@ -104,32 +108,36 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); + struct btrfs_fs_info *(*tree_fs_info)(void *private_data); + void (*set_range_writeback)(void *private_data, u64 start, u64 end); /* * Optional hooks, called if the pointer is not NULL */ - int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + void (*set_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*clear_bit_hook)(struct btrfs_inode *inode, + void (*clear_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*merge_extent_hook)(struct inode *inode, + void (*merge_extent_hook)(void *private_data, struct extent_state *new, struct extent_state *other); - void (*split_extent_hook)(struct inode *inode, + void (*split_extent_hook)(void *private_data, struct extent_state *orig, u64 split); + void (*check_extent_io_range)(void *private_data, const char *caller, + u64 start, u64 end); }; struct extent_io_tree { struct rb_root state; - struct address_space *mapping; + void *private_data; u64 dirty_bytes; int track_uptodate; spinlock_t lock; @@ -143,7 +151,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; - atomic_t refs; + refcount_t refs; unsigned state; struct io_failure_record *failrec; @@ -201,12 +209,46 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -226,8 +268,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, u64 start, u64 len, int create); -void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping); +void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -455,20 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags); -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); struct btrfs_fs_info; struct btrfs_inode; -int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, - u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); -int clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int mirror_num); @@ -503,7 +545,9 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec); +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 26f9ac719d20..69850155870c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void) em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; - atomic_set(&em->refs, 1); + refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } @@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(atomic_read(&em->refs) == 0); - if (atomic_dec_and_test(&em->refs)) { + WARN_ON(refcount_read(&em->refs) == 0); + if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) @@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); em->mod_start = em->start; em->mod_len = em->len; @@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, if (strict && !(end > em->start && start < extent_map_end(em))) return NULL; - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index eb8b8fae036b..a67b2def5413 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -2,6 +2,7 @@ #define __EXTENTMAP__ #include <linux/rbtree.h> +#include <linux/refcount.h> #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) @@ -41,7 +42,7 @@ struct extent_map { */ struct map_lookup *map_lookup; }; - atomic_t refs; + refcount_t refs; unsigned int compress_type; struct list_head list; }; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31d7163..fdcb41002623 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,11 +160,12 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -177,12 +178,12 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 page_bytes_left; u32 diff; int nblocks; - int count = 0, i; + int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +192,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -206,8 +207,6 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; - WARN_ON(bio->bi_vcnt <= 0); - /* * the free space stuff is only read when it hasn't been * updated in the current transaction. So, we can safely @@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (dio) offset = logical_offset; - bio_for_each_segment_all(bvec, bio, i) { - page_bytes_left = bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + page_bytes_left = bvec.bv_len; if (count) goto next; if (!dio) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, (u32 *)csum, nblocks); if (count) @@ -303,12 +302,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,26 +432,26 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec; + struct bvec_iter iter; + struct bio_vec bvec; int index; int nr_sectors; - int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; + int i; u64 offset; - WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment(bvec, bio, iter) { if (!contig) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, - bvec->bv_len + fs_info->sectorsize + bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < nr_sectors; i++) { @@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + total_bytes; index = 0; - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); } sums->sums[index] = ~(u32)0; sums->sums[index] - = btrfs_csum_data(data + bvec->bv_offset + = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), sums->sums[index], fs_info->sectorsize); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 520cb7230b2d..9e75d8a39aac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1404,6 +1404,47 @@ fail: } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size) { + if (start_pos < inode->vfs_inode.i_size || + (inode->flags & BTRFS_INODE_PREALLOC)) { struct btrfs_ordered_extent *ordered; + unsigned int clear_bits; + lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - + ret = btrfs_find_new_delalloc_bytes(inode, start_pos, + last_pos - start_pos + 1, + cached_state); + clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; + if (ret) + clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; clear_extent_bit(&inode->io_tree, start_pos, - last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + last_pos, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, cached_state, GFP_NOFS); + if (ret) + return ret; *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1529,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1576,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1605,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1688,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1744,12 +1801,13 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } + extent_changeset_free(data_reserved); return num_written ? num_written : ret; } @@ -1824,17 +1882,36 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; loff_t pos; - size_t count; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); return err; } + pos = iocb->ki_pos; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } + current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { @@ -1862,8 +1939,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { @@ -1959,7 +2034,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; - int ret = 0; + int ret = 0, err; bool full_sync = 0; u64 len; @@ -1978,7 +2053,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) - return ret; + goto out; inode_lock(inode); atomic_inc(&root->log_batch); @@ -2083,10 +2158,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. So check the inode's mapping - * flags for any errors that might have happened while doing - * writeback of file data. + * for any errors that might have happened since we last + * checked called fsync. */ - ret = filemap_check_errors(inode->i_mapping); + ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); inode_unlock(inode); goto out; } @@ -2175,6 +2250,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; return ret > 0 ? -EIO : ret; } @@ -2338,17 +2416,15 @@ out: */ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; int ret = 0; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); - return ret; - } + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + round_down(*start, fs_info->sectorsize), + round_up(*len, fs_info->sectorsize), 0); + if (IS_ERR(em)) + return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->block_start == EXTENT_MAP_HOLE) { @@ -2722,6 +2798,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2835,11 +2912,8 @@ static long btrfs_fallocate(struct file *file, int mode, while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); break; } last_byte = min(extent_map_end(em), alloc_end); @@ -2854,18 +2928,20 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); - if (ret < 0) + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); break; + } } else { /* * Do not need to reserve unwritten extent for this * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. */ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2884,8 +2960,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2923,8 +3000,9 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } @@ -3025,13 +3103,19 @@ out: return offset; } +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_AIO_NOWAIT; + return generic_file_open(inode, filp); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, - .open = generic_file_open, + .open = btrfs_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da6841efac26..c5e6180cdb8c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_SIZE); + clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index dd7fb22a955a..a5e34de06c2f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -17,7 +17,7 @@ */ #include <linux/kernel.h> -#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "locking.h" @@ -153,22 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static u8 *alloc_bitmap(u32 bitmap_size) { - void *mem; + u8 *ret; + unsigned int nofs_flag; /* - * The allocation size varies, observed numbers were < 4K up to 16K. - * Using vmalloc unconditionally would be too heavy, we'll try - * contiguous allocations first. + * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse + * into the filesystem as the free space bitmap can be modified in the + * critical section of a transaction commit. + * + * TODO: push the memalloc_nofs_{save,restore}() to the caller where we + * know that recursion is unsafe. */ - if (bitmap_size <= PAGE_SIZE) - return kzalloc(bitmap_size, GFP_NOFS); - - mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); - if (mem) - return mem; - - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + nofs_flag = memalloc_nofs_save(); + ret = kvzalloc(bitmap_size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + return ret; } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, @@ -1189,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); @@ -1278,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->commit_root); kfree(free_space_root); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index a97fdc156a03..baacc1866861 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = tfm; @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5c6c20ec64d8..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ -492,7 +493,7 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e71f1ea3391..06dea7c89bbd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; @@ -115,6 +114,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 ram_bytes, int compress_type, int type); +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate); + +/* + * Cleanup all submitted ordered extents in specified range to handle errors + * from the fill_dellaloc() callback. + * + * NOTE: caller must ensure that when an error happens, it can not call + * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING + * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata + * to be released, which we want to happen only when finishing the ordered + * extent (btrfs_finish_ordered_io()). Also note that the caller of the + * fill_delalloc() callback already does proper cleanup for the first page of + * the range, that is, it invokes the callback writepage_end_io_hook() for the + * range of the first page. + */ +static inline void btrfs_cleanup_ordered_extents(struct inode *inode, + const u64 offset, + const u64 bytes) +{ + return __endio_write_update_ordered(inode, offset + PAGE_SIZE, + bytes - PAGE_SIZE, false); +} + static int btrfs_dirty_inode(struct inode *inode); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -153,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; - int err = 0; int ret; size_t cur_size = size; unsigned long offset; @@ -175,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { - err = ret; + if (ret) goto fail; - } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -233,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, BTRFS_I(inode)->disk_i_size = inode->i_size; ret = btrfs_update_inode(trans, root, inode); - return ret; fail: - return err; + return ret; } @@ -325,7 +345,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -547,7 +567,7 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; unsigned long page_error_op; clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; @@ -565,8 +585,10 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); + if (ret == 0) + btrfs_free_reserved_data_space_noquota(inode, + start, + end - start + 1); goto free_pages_out; } } @@ -581,12 +603,11 @@ cont: /* * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk + * win, compare the page count read with the blocks on disk, + * compression must free at least one sector size */ total_in = ALIGN(total_in, PAGE_SIZE); - if (total_compressed >= total_in) { - will_compress = 0; - } else { + if (total_compressed + blocksize <= total_in) { num_bytes = total_in; *num_added += 1; @@ -815,13 +836,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { + async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -852,6 +872,7 @@ out_free: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | @@ -918,10 +939,13 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; - u64 cur_alloc_size; + u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; + unsigned clear_bits; + unsigned long page_ops; + bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -944,6 +968,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); @@ -966,14 +991,14 @@ static noinline int cow_file_range(struct inode *inode, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { - unsigned long op; - cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; + cur_alloc_size = ins.offset; + extent_reserved = true; ram_size = ins.offset; em = create_io_em(inode, start, ins.offset, /* len */ @@ -988,7 +1013,6 @@ static noinline int cow_file_range(struct inode *inode, goto out_reserve; free_extent_map(em); - cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) @@ -998,15 +1022,24 @@ static noinline int cow_file_range(struct inode *inode, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() + * at out_unlock label to free meta of this ordered + * extent, as its meta should be freed by + * btrfs_finish_ordered_io(). + * + * So we must continue until @start is increased to + * skip current ordered extent. + */ if (ret) - goto out_drop_extent_cache; + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + ram_size - 1, 0); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (disk_num_bytes < cur_alloc_size) - break; - /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits @@ -1014,18 +1047,30 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? PAGE_UNLOCK : 0; - op |= PAGE_SET_PRIVATE2; + page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops |= PAGE_SET_PRIVATE2; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, - op); - disk_num_bytes -= cur_alloc_size; + page_ops); + if (disk_num_bytes < cur_alloc_size) + disk_num_bytes = 0; + else + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; + extent_reserved = false; + + /* + * btrfs_reloc_clone_csums() error, since start is increased + * extent_clear_unlock_delalloc() at out_unlock label won't + * free metadata of current ordered extent, we're OK to exit. + */ + if (ret) + goto out_unlock; } out: return ret; @@ -1036,12 +1081,35 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK; + /* + * If we reserved an extent for our delalloc range (or a subrange) and + * failed to create the respective ordered extent, then it means that + * when we reserved the extent we decremented the extent's size from + * the data space_info's bytes_may_use counter and incremented the + * space_info's bytes_reserved counter by the same amount. We must make + * sure extent_clear_unlock_delalloc() does not try to decrement again + * the data space_info's bytes_may_use counter, therefore we do not pass + * it the flag EXTENT_CLEAR_DATA_RESV. + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, + start + cur_alloc_size, + start + cur_alloc_size, + locked_page, + clear_bits, + page_ops); + start += cur_alloc_size; + if (start >= end) + goto out; + } extent_clear_unlock_delalloc(inode, start, end, delalloc_end, locked_page, - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DELALLOC | EXTENT_DEFRAG, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + clear_bits | EXTENT_CLEAR_DATA_RESV, + page_ops); goto out; } @@ -1414,15 +1482,14 @@ out_check: BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + BTRFS_DATA_RELOC_TREE_OBJECTID) + /* + * Error handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler + * from freeing metadata of created ordered extent. + */ ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); - goto error; - } - } extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, end, @@ -1434,6 +1501,14 @@ out_check: if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error + * handler, as metadata for created ordered extent will only + * be freed by btrfs_finish_ordered_io(). + */ + if (ret) + goto error; if (cur_offset > end) break; } @@ -1487,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) /* * extent_io.c call back to do delayed allocation processing */ -static int run_delalloc_range(struct inode *inode, struct page *locked_page, +static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { + struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); @@ -1509,12 +1585,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); } + if (ret) + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } -static void btrfs_split_extent_hook(struct inode *inode, +static void btrfs_split_extent_hook(void *private_data, struct extent_state *orig, u64 split) { + struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1549,10 +1628,11 @@ static void btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(struct inode *inode, +static void btrfs_merge_extent_hook(void *private_data, struct extent_state *new, struct extent_state *other) { + struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1652,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static void btrfs_set_bit_hook(struct inode *inode, +static void btrfs_set_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1682,8 +1763,8 @@ static void btrfs_set_bit_hook(struct inode *inode, if (btrfs_is_testing(fs_info)) return; - __percpu_counter_add(&fs_info->delalloc_bytes, len, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->delalloc_bytes, len, + fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (*bits & EXTENT_DEFRAG) @@ -1693,15 +1774,24 @@ static void btrfs_set_bit_hook(struct inode *inode, btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } + + if (!(state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - + state->start; + spin_unlock(&BTRFS_I(inode)->lock); + } } /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static void btrfs_clear_bit_hook(struct btrfs_inode *inode, +static void btrfs_clear_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1722,7 +1812,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { spin_lock(&inode->lock); inode->outstanding_extents -= num_extents; spin_unlock(&inode->lock); @@ -1733,7 +1823,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, * don't need to call dellalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_DO_ACCOUNTING && + if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); @@ -1741,16 +1831,15 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE) - && (*bits & (EXTENT_DO_ACCOUNTING | - EXTENT_CLEAR_DATA_RESV))) + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + do_list && !(state->state & EXTENT_NORESERVE) && + (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota( &inode->vfs_inode, state->start, len); - __percpu_counter_add(&fs_info->delalloc_bytes, -len, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, + fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; if (do_list && inode->delalloc_bytes == 0 && @@ -1759,6 +1848,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, btrfs_del_delalloc_inode(root, inode); spin_unlock(&inode->lock); } + + if ((state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + ASSERT(inode->new_delalloc_bytes >= len); + inode->new_delalloc_bytes -= len; + spin_unlock(&inode->lock); + } } /* @@ -1802,11 +1899,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { - int ret = 0; + struct inode *inode = private_data; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1821,16 +1919,17 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1840,14 +1939,15 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -1877,8 +1977,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, - bio_flags, bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, + bio_offset, inode, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; @@ -1892,8 +1992,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1936,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -1973,7 +2074,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -1993,6 +2094,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + extent_changeset_free(data_reserved); } /* @@ -2044,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 qg_released; int extent_inserted = 0; int ret; @@ -2099,13 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins); + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head */ - btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + if (ret < 0) + goto out; + qg_released = ret; + ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2791,6 +2898,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->len; bool nolock; bool truncated = false; + bool range_locked = false; + bool clear_new_delalloc_bytes = false; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + clear_new_delalloc_bytes = true; nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); @@ -2820,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -2839,13 +2953,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + range_locked = true; lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 1, cached_state); + EXTENT_DEFRAG, 0, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); if (0 && last_snapshot >= BTRFS_I(inode)->generation) @@ -2864,7 +2979,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_unlock; + goto out; } trans->block_rsv = &fs_info->delalloc_block_rsv; @@ -2896,7 +3011,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } add_pending_csums(trans, inode, &ordered_extent->list); @@ -2905,14 +3020,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } ret = 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); out: + if (range_locked || clear_new_delalloc_bytes) { + unsigned int clear_bits = 0; + + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, + clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, &cached_state, GFP_NOFS); + } + if (root != fs_info->tree_root) btrfs_delalloc_release_metadata(BTRFS_I(inode), ordered_extent->len); @@ -4401,9 +4528,17 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); + + trace_btrfs_truncate_show_fi_regular( + BTRFS_I(inode), leaf, fi, + found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, path->slots[0], fi); + + trace_btrfs_truncate_show_fi_inline( + BTRFS_I(inode), leaf, fi, path->slots[0], + found_key.offset); } item_end--; } @@ -4603,13 +4738,6 @@ error: btrfs_free_path(path); - if (err == 0) { - /* only inline file may have last_size != new_size */ - if (new_size >= fs_info->sectorsize || - new_size > fs_info->max_inline) - ASSERT(last_size == new_size); - } - if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { @@ -4642,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4656,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4664,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4736,11 +4865,12 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -5135,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -5748,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; @@ -5799,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -5814,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) ctx->pos = found_key.offset; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, di)) + if (verify_dir_item(fs_info, leaf, slot, di)) goto next; name_len = btrfs_dir_name_len(leaf, di); @@ -6735,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, * * This also copies inline extents directly into the page. */ - struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, @@ -6835,11 +6962,18 @@ again: found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); + + trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, + extent_start); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); + + trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, + path->slots[0], + extent_start); } next: if (start >= extent_end) { @@ -7037,19 +7171,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em = btrfs_get_extent(inode, page, pg_offset, start, len, create); if (IS_ERR(em)) return em; - if (em) { - /* - * if our em maps to - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - } + /* + * If our em maps to: + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. + */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; /* check to see if we've wrapped (len == -1 or similar) */ end = start + len; @@ -7356,11 +7488,11 @@ out: bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) { struct radix_tree_root *root = &inode->i_mapping->page_tree; - int found = false; + bool found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; @@ -7854,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int isector; int read_mode = 0; + int segs; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7868,13 +8003,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } - if ((failed_bio->bi_vcnt > 1) - || (failed_bio->bi_io_vec->bv_len - > btrfs_inode_sectorsize(inode))) + segs = bio_segments(failed_bio); + if (segs > 1 || + (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -7882,7 +8017,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, pgoff, isector, repair_endio, repair_arg); if (!bio) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -7893,7 +8028,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } @@ -7910,19 +8045,24 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; + struct inode *inode = done->inode; struct bio_vec *bvec; + struct extent_io_tree *io_tree, *failure_tree; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(BTRFS_I(done->inode), done->start, - bvec->bv_page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, + io_tree, done->start, bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), 0); end: complete(&done->done); bio_put(bio); @@ -7932,36 +8072,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; unsigned int pgoff; u32 sectorsize; int nr_sectors; - int i; int ret; + int err = 0; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); + pgoff = bvec.bv_offset; next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio_nocsum, &done); - if (ret) - return ret; + if (ret) { + err = ret; + goto next; + } wait_for_completion(&done.done); @@ -7970,6 +8114,7 @@ next_block_or_try_again: goto next_block_or_try_again; } +next: start += sectorsize; nr_sectors--; @@ -7980,19 +8125,21 @@ next_block_or_try_again: } } - return 0; + return err; } static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct extent_io_tree *io_tree, *failure_tree; + struct inode *inode = done->inode; struct bio_vec *bvec; int uptodate; int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; @@ -8000,13 +8147,19 @@ static void btrfs_retry_endio(struct bio *bio) ASSERT(bio->bi_vcnt == 1); ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + bio_for_each_segment_all(bvec, bio, i) { - ret = __readpage_endio_check(done->inode, io_bio, i, - bvec->bv_page, bvec->bv_offset, - done->start, bvec->bv_len); + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + bvec->bv_offset, done->start, + bvec->bv_len); if (!ret) - clean_io_failure(BTRFS_I(done->inode), done->start, - bvec->bv_page, bvec->bv_offset); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, io_tree, done->start, + bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), + bvec->bv_offset); else uptodate = 0; } @@ -8017,11 +8170,12 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; u64 offset = 0; @@ -8029,7 +8183,7 @@ static int __btrfs_subio_endio_read(struct inode *inode, int nr_sectors; unsigned int pgoff; int csum_pos; - int i; + bool uptodate = (err == 0); int ret; fs_info = BTRFS_I(inode)->root->fs_info; @@ -8038,29 +8192,31 @@ static int __btrfs_subio_endio_read(struct inode *inode, err = 0; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec->bv_offset; + pgoff = bvec.bv_offset; next_block: - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec->bv_page, pgoff, start, - sectorsize); - if (likely(!ret)) - goto next; + if (uptodate) { + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, io_bio, csum_pos, + bvec.bv_page, pgoff, start, sectorsize); + if (likely(!ret)) + goto next; + } try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8087,8 +8243,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8108,10 +8264,13 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { err = btrfs_subio_endio_read(inode, io_bio, err); + if (!err) + bio->bi_status = 0; + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8119,25 +8278,34 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } -static void btrfs_endio_direct_write_update_ordered(struct inode *inode, - const u64 offset, - const u64 bytes, - const int uptodate) +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; int ret; + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -8146,9 +8314,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, btrfs_endio_write_helper, - finish_ordered_fn, NULL, NULL); - btrfs_queue_work(fs_info->endio_write_workers, &ordered->work); + btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -8166,23 +8333,22 @@ static void btrfs_endio_direct_write(struct bio *bio) struct btrfs_dio_private *dip = bio->bi_private; struct bio *dio_bio = dip->dio_bio; - btrfs_endio_direct_write_update_ordered(dip->inode, - dip->logical_offset, - dip->bytes, - !bio->bi_error); + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8191,7 +8357,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -8221,31 +8387,21 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: bio_put(bio); } -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, - u64 first_sector, gfp_t gfp_flags) -{ - struct bio *bio; - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); - if (bio) - bio_associate_current(bio); - return bio; -} - -static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8276,7 +8432,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8293,8 +8449,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0, - file_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, + file_offset, inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8324,103 +8480,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 submit_len = 0; u64 map_length; - u32 blocksize = fs_info->sectorsize; int async_submit = 0; - int nr_sectors; + u64 submit_len; + int clone_offset = 0; + int clone_len; int ret; - int i, j; map_length = orig_bio->bi_iter.bi_size; + submit_len = map_length; ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; - if (map_length >= orig_bio->bi_iter.bi_size) { + if (map_length >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + /* bio split */ + ASSERT(map_length <= INT_MAX); atomic_inc(&dip->pending_bios); + do { + clone_len = min_t(int, submit_len, map_length); - bio_for_each_segment_all(bvec, orig_bio, j) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - i = 0; -next_block: - if (unlikely(map_length < submit_len + blocksize || - bio_add_page(bio, bvec->bv_page, blocksize, - bvec->bv_offset + (i * blocksize)) < blocksize)) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the dip might get freed - * before we're done setting it up - */ - atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, - file_offset, skip_sum, - async_submit); - if (ret) { - bio_put(bio); - atomic_dec(&dip->pending_bios); - goto out_err; - } - - start_sector += submit_len >> 9; - file_offset += submit_len; + /* + * This will never fail as it's passing GPF_NOFS and + * the allocation is backed by btrfs_bioset. + */ + bio = btrfs_bio_clone_partial(orig_bio, clone_offset, + clone_len); + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + + ASSERT(submit_len >= clone_len); + submit_len -= clone_len; + if (submit_len == 0) + break; - submit_len = 0; + /* + * Increase the count before we submit the bio so we know + * the end IO handler won't happen before we increase the + * count. Otherwise, the dip might get freed before we're + * done setting it up. + */ + atomic_inc(&dip->pending_bios); - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, - start_sector, GFP_NOFS); - if (!bio) - goto out_err; - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } - map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(bio); - goto out_err; - } + clone_offset += clone_len; + start_sector += clone_len >> 9; + file_offset += clone_len; - goto next_block; - } else { - submit_len += blocksize; - if (--nr_sectors) { - i++; - goto next_block; - } - } - } + map_length = submit_len; + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); + if (ret) + goto out_err; + } while (submit_len > 0); submit: ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, @@ -8447,19 +8583,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { struct btrfs_dio_private *dip = NULL; - struct bio *io_bio = NULL; - struct btrfs_io_bio *btrfs_bio; + struct bio *bio = NULL; + struct btrfs_io_bio *io_bio; int skip_sum; bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { - ret = -ENOMEM; - goto free_ordered; - } + bio = btrfs_bio_clone(dio_bio); dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { @@ -8472,17 +8604,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - io_bio->bi_private = dip; - dip->orig_bio = io_bio; + bio->bi_private = dip; + dip->orig_bio = bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); - btrfs_bio = btrfs_io_bio(io_bio); - btrfs_bio->logical = file_offset; + io_bio = btrfs_io_bio(bio); + io_bio->logical = file_offset; if (write) { - io_bio->bi_end_io = btrfs_endio_direct_write; + bio->bi_end_io = btrfs_endio_direct_write; } else { - io_bio->bi_end_io = btrfs_endio_direct_read; + bio->bi_end_io = btrfs_endio_direct_read; dip->subio_endio = btrfs_subio_endio_read; } @@ -8505,8 +8637,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (btrfs_bio->end_io) - btrfs_bio->end_io(btrfs_bio, ret); + if (io_bio->end_io) + io_bio->end_io(io_bio, ret); free_ordered: /* @@ -8518,35 +8650,34 @@ free_ordered: * same as btrfs_endio_direct_[write|read] because we can't call these * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (io_bio && dip) { - io_bio->bi_error = -EIO; - bio_endio(io_bio); + if (bio && dip) { + bio_io_error(bio); /* - * The end io callbacks free our dip, do the final put on io_bio + * The end io callbacks free our dip, do the final put on bio * and all the cleanup and final put for dio_bio (through * dio_end_io()). */ dip = NULL; - io_bio = NULL; + bio = NULL; } else { if (write) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, file_offset, dio_bio->bi_iter.bi_size, - 0); + false); else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. */ - dio_end_io(dio_bio, ret); + dio_end_io(dio_bio); } - if (io_bio) - bio_put(io_bio); + if (bio) + bio_put(bio); kfree(dip); } @@ -8590,6 +8721,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8625,8 +8757,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; dio_data.outstanding_extents = count_max_extents(count); @@ -8658,8 +8794,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8668,14 +8804,14 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, - 0); + false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -8683,6 +8819,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -8819,6 +8956,7 @@ again: if (!inode_evicting) clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); @@ -8872,12 +9010,12 @@ again: * free the entire extent. */ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); @@ -8914,6 +9052,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -8939,7 +9078,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { ret = file_update_time(vmf->vma->vm_file); @@ -8993,8 +9132,8 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } @@ -9045,13 +9184,16 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } @@ -9248,6 +9390,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; @@ -9272,8 +9415,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, &inode->i_data); - extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + extent_io_tree_init(&ei->io_tree, inode); + extent_io_tree_init(&ei->io_failure_tree, inode); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; atomic_set(&ei->sync_writers, 0); @@ -9313,6 +9456,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); WARN_ON(BTRFS_I(inode)->defrag_bytes); @@ -9381,7 +9525,6 @@ void btrfs_destroy_cachep(void) rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_transaction_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); } @@ -9401,12 +9544,6 @@ int btrfs_init_cachep(void) if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", - sizeof(struct btrfs_transaction), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_transaction_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); @@ -9431,12 +9568,30 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, u64 delalloc_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; + u32 bi_flags = BTRFS_I(inode)->flags; + + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + if (bi_flags & BTRFS_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (bi_flags & BTRFS_INODE_COMPRESS) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (bi_flags & BTRFS_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (bi_flags & BTRFS_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); - delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; @@ -10405,7 +10560,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } @@ -10526,6 +10681,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) return -EAGAIN; } +static struct btrfs_fs_info *iotree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + +static void btrfs_check_extent_io_range(void *private_data, const char *caller, + u64 start, u64 end) +{ + struct inode *inode = private_data; + u64 isize; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } +} + +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end) +{ + struct inode *inode = private_data; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + ASSERT(page); /* Pages should be in the extent_io_tree */ + set_page_writeback(page); + put_page(page); + index++; + } +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10569,6 +10760,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, + .tree_fs_info = iotree_fs_info, + .set_range_writeback = btrfs_set_range_writeback, /* optional callbacks */ .fill_delalloc = run_delalloc_range, @@ -10578,6 +10771,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, .split_extent_hook = btrfs_split_extent_hook, + .check_extent_io_range = btrfs_check_extent_io_range, }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dabfc7ac48a6..fa1b78cf25f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -37,7 +37,7 @@ #include <linux/bit_spinlock.h> #include <linux/security.h> #include <linux/xattr.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> @@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1226,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1247,15 +1248,17 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } @@ -1504,7 +1507,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1619,7 +1622,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; } @@ -2661,7 +2664,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; mutex_lock(&fs_info->volume_mutex); @@ -2680,7 +2683,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2708,7 +2711,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) return -EOPNOTSUPP; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -2721,7 +2724,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = btrfs_rm_device(fs_info, vol_args->name, 0); } mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -2752,7 +2755,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -2772,7 +2775,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out_drop_write: mnt_drop_write_file(file); @@ -3539,12 +3542,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); - if (!buf) { - buf = vmalloc(fs_info->nodesize); - if (!buf) - return ret; - } + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; path = btrfs_alloc_path(); if (!path) { @@ -4442,13 +4442,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - atomic_set( - &fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -4593,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, out: btrfs_free_path(path); - vfree(inodes); + kvfree(inodes); kfree(loi); return ret; @@ -4643,7 +4641,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; @@ -4689,7 +4687,7 @@ again: } locked: - BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4745,11 +4743,10 @@ locked: do_balance: /* - * Ownership of bctl and mutually_exclusive_operation_running + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP * goes to to btrfs_balance. bctl is freed in __cancel_balance, * or, if restriper was paused all the way until unmount, in - * free_fs_info. mutually_exclusive_operation_running is - * cleared in __cancel_balance. + * free_fs_info. The flag is cleared in __cancel_balance. */ need_unlock = false; @@ -4769,7 +4766,7 @@ out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); if (need_unlock) - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: mnt_drop_write_file(file); return ret; @@ -4903,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, fs_info, sa->src, sa->dst); @@ -4962,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->create) { ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); } else { @@ -5016,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = root->root_key.objectid; } - /* FIXME: check if the IDs really exist */ ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index f48c8c14dc14..d433e75d489a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -18,13 +18,14 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/lzo.h> +#include <linux/refcount.h> #include "compression.h" #define LZO_LEN 4 @@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->buf); - vfree(workspace->cbuf); - vfree(workspace->mem); + kvfree(workspace->buf); + kvfree(workspace->cbuf); + kvfree(workspace->mem); kfree(workspace); } @@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); - workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - pr_debug("BTRFS: deflate in loop returned %d\n", + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; @@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws, in_len = min(bytes_left, PAGE_SIZE); } - if (tot_out > tot_in) + if (tot_out >= tot_in) { + ret = -E2BIG; goto out; + } /* store the size of all chunks of compressed data */ cpage_out = kmap(pages[0]); @@ -254,16 +257,13 @@ out: return ret; } -static int lzo_decompress_bio(struct list_head *ws, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; char *data_in; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; @@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws, unsigned long tot_len; char *buf; bool may_late_unmap, need_unmap; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9a46878ba60f..a3aca495e33e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); /* one ref for the tree */ - atomic_set(&entry->refs, 1); + refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); @@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -425,7 +425,7 @@ have_entry: out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode, if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } @@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); - if (atomic_dec_and_test(&entry->refs)) { + if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->log_list)); ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); @@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ASSERT(trans); @@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, LIST_HEAD(skipped); LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; - int count = 0; + u64 count = 0; const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); @@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, list_move_tail(&ordered->root_extent_list, &root->ordered_extents); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, @@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, cond_resched(); spin_lock(&root->ordered_extent_lock); - if (nr != -1) + if (nr != U64_MAX) nr--; count++; } @@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, - const u64 range_start, const u64 range_len) +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - int done; - int total_done = 0; + u64 total_done = 0; + u64 done; INIT_LIST_HEAD(&splice); @@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, total_done += done; spin_lock(&fs_info->ordered_root_lock); - if (nr != -1) { + if (nr != U64_MAX) { nr -= done; - WARN_ON(nr < 0); } } list_splice_tail(&splice, &fs_info->ordered_roots); @@ -870,7 +869,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (!offset_in_entry(entry, file_offset)) entry = NULL; if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; @@ -911,7 +910,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( } out: if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); spin_unlock_irq(&tree->lock); return entry; } @@ -948,7 +947,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 195c93b67fe0..56c4c0ee6381 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -113,7 +113,7 @@ struct btrfs_ordered_extent { int compress_type; /* reference count */ - atomic_t refs; + refcount_t refs; /* the inode we belong to */ struct inode *inode; @@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct btrfs_inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index cdafbf92ef0c..fcae61e175f3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); - pr_info("\t\tblock group used %llu\n", - btrfs_disk_block_group_used(l, bi)); + pr_info( + "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", + btrfs_disk_block_group_used(l, bi), + btrfs_disk_block_group_chunk_objectid(l, bi), + btrfs_disk_block_group_flags(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: print_chunk(l, btrfs_item_ptr(l, i, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index d6cb155ef7a1..4b23ae5d0e5c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; + if (verify_dir_item(fs_info, leaf, + path->slots[0], di)) { + ret = -EIO; + goto out; + } + if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index afbea61d957e..4ce351efe281 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,50 +47,6 @@ * - check all ioctl parameters */ -/* - * one struct for each qgroup, organized in fs_info->qgroup_tree. - */ -struct btrfs_qgroup { - u64 qgroupid; - - /* - * state - */ - u64 rfer; /* referenced */ - u64 rfer_cmpr; /* referenced compressed */ - u64 excl; /* exclusive */ - u64 excl_cmpr; /* exclusive compressed */ - - /* - * limits - */ - u64 lim_flags; /* which limits are set */ - u64 max_rfer; - u64 max_excl; - u64 rsv_rfer; - u64 rsv_excl; - - /* - * reservation tracking - */ - u64 reserved; - - /* - * lists - */ - struct list_head groups; /* groups this group is member of */ - struct list_head members; /* groups that are members of this group */ - struct list_head dirty; /* dirty groups */ - struct rb_node node; /* tree of qgroups */ - - /* - * temp variables for accounting operations - * Refer to qgroup_shared_accounting() for details. - */ - u64 old_refcnt; - u64 new_refcnt; -}; - static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -1078,6 +1034,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else @@ -1103,6 +1060,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, + -(s64)num_bytes); if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); @@ -1447,38 +1406,6 @@ out: return ret; } -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - u64 qgroup_to_skip; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - qgroup_to_skip = delayed_refs->qgroup_to_skip; - - /* - * No need to do lock, since this function will only be called in - * btrfs_commit_transaction(). - */ - node = rb_first(&delayed_refs->dirty_extent_root); - while (node) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - if (WARN_ON(!record->old_roots)) - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, &record->old_roots); - if (ret < 0) - break; - if (qgroup_to_skip) - ulist_del(record->old_roots, qgroup_to_skip, 0); - node = rb_next(node); - } - return ret; -} - int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1600,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, if (ret) return ret; } + cond_resched(); return 0; } @@ -1959,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, return 0; } +/* + * Check if the @roots potentially is a list of fs tree roots + * + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots in the list (considering an empty + * one as well) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* + * If it contains fs tree roots, then it must belong to fs/subvol + * trees. + * If it contains a non-fs tree, it won't be shared with fs/subvol trees. + */ + return is_fstree(unode->val); +} + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1975,10 +1932,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - if (new_roots) + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; nr_new_roots = new_roots->nnodes; - if (old_roots) + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; nr_old_roots = old_roots->nnodes; + } + + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) + goto out_free; BUG_ON(!fs_info->quota_root); @@ -2058,16 +2025,32 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* - * Use (u64)-1 as time_seq to do special search, which + * Old roots should be searched when inserting qgroup + * extent record + */ + if (WARN_ON(!record->old_roots)) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, + &record->old_roots); + if (ret < 0) + goto cleanup; + } + + /* + * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, (u64)-1, &new_roots); + record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; - if (qgroup_to_skip) + if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); @@ -2370,6 +2353,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; + int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2379,6 +2363,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + +retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2405,6 +2394,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); if (enforce && !qgroup_check_limits(qg, num_bytes)) { + /* + * Commit the tree and retry, since we may have + * deletions which would free up space. + */ + if (!retried && qg->reserved > 0) { + struct btrfs_trans_handle *trans; + + spin_unlock(&fs_info->qgroup_lock); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + retried++; + goto retry; + } ret = -EDQUOT; goto out; } @@ -2427,6 +2437,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, num_bytes); qg->reserved += num_bytes; } @@ -2472,6 +2483,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); if (qg->reserved < num_bytes) report_reserved_underflow(fs_info, qg, num_bytes); else @@ -2490,18 +2502,6 @@ out: spin_unlock(&fs_info->qgroup_lock); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) -{ - if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) - return; - btrfs_err(trans->fs_info, - "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x", - trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); -} - /* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. @@ -2835,70 +2835,146 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed, true); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_release(&changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_release(&changeset.range_changed); + extent_changeset_release(reserved); return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; - if (free) { - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + if (free) trace_op = QGROUP_FREE; - } trace_btrfs_qgroup_release_data(inode, start, len, changeset.bytes_changed, trace_op); + if (free) + btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, + BTRFS_I(inode)->root->objectid, + changeset.bytes_changed); + ret = changeset.bytes_changed; out: - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2907,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2934,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -2948,6 +3027,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); + trace_qgroup_meta_reserve(root, (s64)num_bytes); ret = qgroup_reserve(root, num_bytes, enforce); if (ret < 0) return ret; @@ -2967,6 +3047,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; + trace_qgroup_meta_reserve(root, -(s64)reserved); btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); } @@ -2981,6 +3062,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); + trace_qgroup_meta_reserve(root, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); } @@ -2995,8 +3077,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3013,5 +3094,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) changeset.bytes_changed); } - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 26932a8a1993..d9984e87cddf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -62,6 +62,50 @@ struct btrfs_qgroup_extent_record { }; /* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + * Refer to qgroup_shared_accounting() for details. + */ + u64 old_refcnt; + u64 new_refcnt; +}; + +/* * For qgroup event trace points only */ #define QGROUP_RESERVE (1<<0) @@ -90,8 +134,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); + /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. @@ -186,17 +229,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes); -/* - * TODO: Add proper trace point for it, as btrfs_qgroup_free() is - * called by everywhere, can't provide good trace for delayed ref case. - */ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, @@ -204,9 +242,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, bool enforce); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1571bf26dc07..6f845d219cd6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -31,7 +31,7 @@ #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -149,7 +149,7 @@ struct btrfs_raid_bio { int generic_bio_cnt; - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; @@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) * of a failing mount. */ table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!table) { - table = vzalloc(table_size); - if (!table) - return -ENOMEM; - } + table = kvzalloc(table_size, GFP_KERNEL); + if (!table) + return -ENOMEM; spin_lock_init(&table->cache_lock); INIT_LIST_HEAD(&table->stripe_cache); @@ -389,7 +386,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } @@ -480,7 +477,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); @@ -689,7 +686,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); - atomic_dec(&cur->refs); + refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; @@ -738,7 +735,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) } } lockit: - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock_irqrestore(&h->lock, flags); @@ -784,7 +781,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios @@ -801,7 +798,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); @@ -843,8 +840,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; - WARN_ON(atomic_read(&rbio->refs) < 0); - if (!atomic_dec_and_test(&rbio->refs)) + if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); @@ -872,7 +868,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -885,7 +881,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -898,7 +894,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -915,7 +911,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -997,7 +993,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; - atomic_set(&rbio->refs, 1); + refcount_set(&rbio->refs, 1); atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); @@ -1093,7 +1089,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1102,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); - if (!bio) - return -ENOMEM; - + bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; @@ -1449,7 +1442,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1992,7 +1985,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2118,6 +2111,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_raid_bio *rbio; int ret; + if (generic_io) { + ASSERT(bbio->mirror_num == mirror_num); + btrfs_io_bio(bio)->mirror_num = mirror_num; + } + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { if (generic_io) @@ -2194,6 +2192,8 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * + * Caller must have already increased bio_counter for getting @bbio. + * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. @@ -2231,6 +2231,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + /* + * We have already increased bio_counter when getting bbio, record it + * so we can free it at rbio_orig_end_io(). + */ + rbio->generic_bio_cnt = 1; + return rbio; } @@ -2518,7 +2524,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2673,6 +2679,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return NULL; } + /* + * When we get bbio, we have already increased bio_counter, record it + * so we can free it at rbio_orig_end_io() + */ + rbio->generic_bio_cnt = 1; + return rbio; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e88bca87f5d2..ab852b8e3e37 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - int err; struct list_head extctl; int refcnt; spinlock_t lock; @@ -209,9 +208,9 @@ cleanup: return; } -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err) +int btree_readahead_hook(struct extent_buffer *eb, int err) { + struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; struct reada_extent *re; @@ -235,10 +234,10 @@ start_machine: return ret; } -static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, u64 logical, +static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; @@ -270,6 +269,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, if (!zone) return NULL; + ret = radix_tree_preload(GFP_KERNEL); + if (ret) { + kfree(zone); + return NULL; + } + zone->start = start; zone->end = end; INIT_LIST_HEAD(&zone->list); @@ -299,6 +304,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; } spin_unlock(&fs_info->reada_lock); + radix_tree_preload_end(); return zone; } @@ -313,7 +319,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; - u32 blocksize; u64 length; int real_stripes; int nzones = 0; @@ -334,7 +339,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!re) return NULL; - blocksize = fs_info->nodesize; re->logical = logical; re->top = *top; INIT_LIST_HEAD(&re->extctl); @@ -344,10 +348,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, /* * map block */ - length = blocksize; + length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &length, &bbio, 0); - if (ret || !bbio || length < blocksize) + if (ret || !bbio || length < fs_info->nodesize) goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { @@ -367,7 +371,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; - zone = reada_find_zone(fs_info, dev, logical, bbio); + zone = reada_find_zone(dev, logical, bbio); if (!zone) continue; @@ -386,6 +390,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + ret = radix_tree_preload(GFP_KERNEL); + if (ret) + goto error; + /* insert extent in reada_tree + all per-device trees, all or nothing */ btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); @@ -395,13 +403,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } + radix_tree_preload_end(); prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); @@ -639,9 +650,9 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } -static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +static int reada_start_machine_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct reada_extent *re = NULL; int mirror_num = 0; struct extent_buffer *eb = NULL; @@ -754,8 +765,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(fs_info, - device); + enqueued += reada_start_machine_dev(device); } mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d60df51959f7..65661d1aae4e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -4269,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&rc->processed_blocks, NULL); return rc; } @@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, -1, + btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group->key.objectid, rc->block_group->key.offset); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a08224eab8b4..460db0cb2d07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -390,6 +390,13 @@ again: WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); + ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, + name_len); + if (!ret) { + err = -EIO; + goto out; + } + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); @@ -501,8 +508,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct = current_fs_time(root->fs_info->sb); + struct timespec ct; + ktime_get_real_ts(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b0251eb1239f..6f1e4c984b94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -18,6 +18,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -64,7 +65,7 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_recover { - atomic_t refs; + refcount_t refs; struct btrfs_bio *bbio; u64 map_length; }; @@ -95,7 +96,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -112,7 +113,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t refs; /* free mem on transition to zero */ + refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -140,9 +141,9 @@ struct scrub_parity { int nsectors; - int stripe_len; + u64 stripe_len; - atomic_t refs; + refcount_t refs; struct list_head spages; @@ -161,14 +162,6 @@ struct scrub_parity { unsigned long bitmap[0]; }; -struct scrub_wr_ctx { - struct scrub_bio *wr_curr_bio; - struct btrfs_device *tgtdev; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ - atomic_t flush_all_writes; - struct mutex wr_lock; -}; - struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_fs_info *fs_info; @@ -183,11 +176,14 @@ struct scrub_ctx { atomic_t cancel_req; int readonly; int pages_per_rd_bio; - u32 sectorsize; - u32 nodesize; int is_dev_replace; - struct scrub_wr_ctx wr_ctx; + + struct scrub_bio *wr_curr_bio; + struct mutex wr_lock; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct btrfs_device *wr_tgtdev; /* * statistics @@ -202,7 +198,7 @@ struct scrub_ctx { * doesn't free the scrub context before or while the workers are * doing the wakeup() call. */ - atomic_t refs; + refcount_t refs; }; struct scrub_fixup_nodatasum { @@ -240,6 +236,13 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct full_stripe_lock { + struct rb_node node; + u64 logical; + u64 refs; + struct mutex mutex; +}; + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -282,10 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num); -static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx, - struct btrfs_device *dev, - int is_dev_replace); -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); @@ -305,7 +304,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -349,6 +348,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } /* + * Insert new full stripe lock into full stripe locks tree + * + * Return pointer to existing or newly inserted full_stripe_lock structure if + * everything works well. + * Return ERR_PTR(-ENOMEM) if we failed to allocate memory + * + * NOTE: caller must hold full_stripe_locks_root->lock before calling this + * function + */ +static struct full_stripe_lock *insert_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct full_stripe_lock *entry; + struct full_stripe_lock *ret; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + p = &locks_root->root.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) { + p = &(*p)->rb_left; + } else if (fstripe_logical > entry->logical) { + p = &(*p)->rb_right; + } else { + entry->refs++; + return entry; + } + } + + /* Insert new lock */ + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + ret->logical = fstripe_logical; + ret->refs = 1; + mutex_init(&ret->mutex); + + rb_link_node(&ret->node, parent, p); + rb_insert_color(&ret->node, &locks_root->root); + return ret; +} + +/* + * Search for a full stripe lock of a block group + * + * Return pointer to existing full stripe lock if found + * Return NULL if not found + */ +static struct full_stripe_lock *search_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node *node; + struct full_stripe_lock *entry; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + node = locks_root->root.rb_node; + while (node) { + entry = rb_entry(node, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) + node = node->rb_left; + else if (fstripe_logical > entry->logical) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * Helper to get full stripe logical from a normal bytenr. + * + * Caller must ensure @cache is a RAID56 block group. + */ +static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, + u64 bytenr) +{ + u64 ret; + + /* + * Due to chunk item size limit, full stripe length should not be + * larger than U32_MAX. Just a sanity check here. + */ + WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); + + /* + * round_down() can only handle power of 2, while RAID56 full + * stripe length can be 64KiB * n, so we need to manually round down. + */ + ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * + cache->full_stripe_len + cache->key.objectid; + return ret; +} + +/* + * Lock a full stripe to avoid concurrency of recovery and read + * + * It's only used for profiles with parities (RAID5/6), for other profiles it + * does nothing. + * + * Return 0 if we locked full stripe covering @bytenr, with a mutex held. + * So caller must call unlock_full_stripe() at the same context. + * + * Return <0 if encounters error. + */ +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool *locked_ret) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *existing; + u64 fstripe_start; + int ret = 0; + + *locked_ret = false; + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + + /* Profiles not based on parity don't need full stripe lock */ + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + locks_root = &bg_cache->full_stripe_locks_root; + + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + /* Now insert the full stripe lock */ + mutex_lock(&locks_root->lock); + existing = insert_full_stripe_lock(locks_root, fstripe_start); + mutex_unlock(&locks_root->lock); + if (IS_ERR(existing)) { + ret = PTR_ERR(existing); + goto out; + } + mutex_lock(&existing->mutex); + *locked_ret = true; +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* + * Unlock a full stripe. + * + * NOTE: Caller must ensure it's the same context calling corresponding + * lock_full_stripe(). + * + * Return 0 if we unlock full stripe without problem. + * Return <0 for error + */ +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool locked) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *fstripe_lock; + u64 fstripe_start; + bool freeit = false; + int ret = 0; + + /* If we didn't acquire full stripe lock, no need to continue */ + if (!locked) + return 0; + + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + + locks_root = &bg_cache->full_stripe_locks_root; + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + mutex_lock(&locks_root->lock); + fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); + /* Unpaired unlock_full_stripe() detected */ + if (!fstripe_lock) { + WARN_ON(1); + ret = -ENOENT; + mutex_unlock(&locks_root->lock); + goto out; + } + + if (fstripe_lock->refs == 0) { + WARN_ON(1); + btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", + fstripe_lock->logical); + } else { + fstripe_lock->refs--; + } + + if (fstripe_lock->refs == 0) { + rb_erase(&fstripe_lock->node, &locks_root->root); + freeit = true; + } + mutex_unlock(&locks_root->lock); + + mutex_unlock(&fstripe_lock->mutex); + if (freeit) + kfree(fstripe_lock); +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* * used for workers that require transaction commits (i.e., for the * NOCOW case) */ @@ -356,7 +571,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -420,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - scrub_free_wr_ctx(&sctx->wr_ctx); - /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; @@ -441,13 +654,14 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sbio); } + kfree(sctx->wr_curr_bio); scrub_free_csums(sctx); kfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) { - if (atomic_dec_and_test(&sctx->refs)) + if (refcount_dec_and_test(&sctx->refs)) scrub_free_ctx(sctx); } @@ -457,12 +671,11 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; - atomic_set(&sctx->refs, 1); + refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; @@ -487,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->bios[i]->next_free = -1; } sctx->first_free = 0; - sctx->nodesize = fs_info->nodesize; - sctx->sectorsize = fs_info->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); @@ -499,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); - ret = scrub_setup_wr_ctx(&sctx->wr_ctx, - fs_info->dev_replace.tgtdev, is_dev_replace); - if (ret) { - scrub_free_ctx(sctx); - return ERR_PTR(ret); + WARN_ON(sctx->wr_curr_bio != NULL); + mutex_init(&sctx->wr_lock); + sctx->wr_curr_bio = NULL; + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; + atomic_set(&sctx->flush_all_writes, 0); } + return sctx; nomem: @@ -519,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, u32 nlink; int ret; int i; + unsigned nofs_flag; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; @@ -557,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); + /* + * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub + * uses GFP_NOFS in this context, so we keep it consistent but it does + * not seem to be strictly necessary. + */ + nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, swarn->path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; @@ -731,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE, + ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE, fixup->logical, page, offset - page_offset(page), fixup->mirror_num); @@ -857,12 +1080,14 @@ out: static inline void scrub_get_recover(struct scrub_recover *recover) { - atomic_inc(&recover->refs); + refcount_inc(&recover->refs); } -static inline void scrub_put_recover(struct scrub_recover *recover) +static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, + struct scrub_recover *recover) { - if (atomic_dec_and_test(&recover->refs)) { + if (refcount_dec_and_test(&recover->refs)) { + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(recover->bbio); kfree(recover); } @@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int mirror_index; int page_num; int success; + bool full_stripe_locked; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + /* + * For RAID5/6, race can happen for a different device scrub thread. + * For data corruption, Parity and Data threads will both try + * to recovery the data. + * Race can lead to doubly added csum error, or even unrecoverable + * error. + */ + ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); + if (ret < 0) { + spin_lock(&sctx->stat_lock); + if (ret == -ENOMEM) + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + return ret; + } + if (sctx->is_dev_replace && !is_metadata && !have_csum) { sblocks_for_recheck = NULL; goto nodatasum_case; @@ -1241,7 +1485,7 @@ out: sblock->pagev[page_index]->sblock = NULL; recover = sblock->pagev[page_index]->recover; if (recover) { - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); sblock->pagev[page_index]->recover = NULL; } @@ -1251,6 +1495,9 @@ out: kfree(sblocks_for_recheck); } + ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + if (ret < 0) + return ret; return 0; } @@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio, 0, 1); + logical, &mapped_length, &bbio); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -ENOMEM; } - atomic_set(&recover->refs, 1); + refcount_set(&recover->refs, 1); recover->bbio = bbio; recover->map_length = mapped_length; @@ -1365,7 +1615,7 @@ leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); return -ENOMEM; } scrub_page_get(page); @@ -1407,7 +1657,7 @@ leave_nomem: scrub_get_recover(recover); page->recover = recover; } - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; page_index++; @@ -1418,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1443,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1455,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1487,24 +1737,23 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page->dev->bdev; bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } else { bio->bi_iter.bi_sector = page->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(bio)) + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } bio_put(bio); @@ -1576,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1634,7 +1881,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_SIZE); + clear_page(mapped_buffer); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } @@ -1644,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; int ret; - mutex_lock(&wr_ctx->wr_lock); + mutex_lock(&sctx->wr_lock); again: - if (!wr_ctx->wr_curr_bio) { - wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + if (!sctx->wr_curr_bio) { + sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), GFP_KERNEL); - if (!wr_ctx->wr_curr_bio) { - mutex_unlock(&wr_ctx->wr_lock); + if (!sctx->wr_curr_bio) { + mutex_unlock(&sctx->wr_lock); return -ENOMEM; } - wr_ctx->wr_curr_bio->sctx = sctx; - wr_ctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sctx = sctx; + sctx->wr_curr_bio->page_count = 0; } - sbio = wr_ctx->wr_curr_bio; + sbio = sctx->wr_curr_bio; if (sbio->page_count == 0) { struct bio *bio; sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; - sbio->dev = wr_ctx->tgtdev; + sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - wr_ctx->pages_per_wr_bio); - if (!bio) { - mutex_unlock(&wr_ctx->wr_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); sbio->bio = bio; } @@ -1683,7 +1924,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1697,7 +1938,7 @@ again: if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return -EIO; } scrub_wr_submit(sctx); @@ -1707,23 +1948,22 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == wr_ctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_wr_bio) scrub_wr_submit(sctx); - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return 0; } static void scrub_wr_submit(struct scrub_ctx *sctx) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; - if (!wr_ctx->wr_curr_bio) + if (!sctx->wr_curr_bio) return; - sbio = wr_ctx->wr_curr_bio; - wr_ctx->wr_curr_bio = NULL; + sbio = sctx->wr_curr_bio; + sctx->wr_curr_bio = NULL; WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer @@ -1738,7 +1978,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -1753,7 +1993,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -1827,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0]->page; buffer = kmap_atomic(page); - len = sctx->sectorsize; + len = sctx->fs_info->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -1892,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) sblock->header_error = 1; - len = sctx->nodesize - BTRFS_CSUM_SIZE; + len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -1998,12 +2238,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->refs); + refcount_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->refs)) { + if (refcount_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2075,10 +2315,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - sctx->pages_per_rd_bio); - if (!bio) - return -ENOMEM; + bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); sbio->bio = bio; } @@ -2087,7 +2324,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2123,7 +2360,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2166,10 +2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_block_put(sblock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2187,8 +2424,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2203,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) goto bbio_out; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; @@ -2231,6 +2466,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2255,7 +2491,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2332,7 +2568,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2345,7 +2581,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2372,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) spin_unlock(&sctx->list_lock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2385,7 +2621,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - u32 offset; + u64 offset; int nsectors; int sectorsize = sparity->sctx->fs_info->sectorsize; @@ -2395,8 +2631,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - start = div_u64_rem(start, sparity->stripe_len, &offset); - offset /= sectorsize; + start = div64_u64_rem(start, sparity->stripe_len, &offset); + offset = div_u64(offset, sectorsize); nsectors = (int)len / sectorsize; if (offset + nsectors <= sparity->nsectors) { @@ -2470,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) if (!sum) return 0; - index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; - num_sectors = sum->len / sctx->sectorsize; + index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize; + num_sectors = sum->len / sctx->fs_info->sectorsize; memcpy(csum, sum->sums + index, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); @@ -2490,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed++; sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; sctx->stat.tree_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2555,7 +2791,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2636,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2694,7 +2930,7 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div64_u64(*offset, map->stripe_len); stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ @@ -2748,7 +2984,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2765,7 +3001,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct scrub_page *spage; struct btrfs_bio *bbio = NULL; u64 length; int ret; @@ -2775,15 +3010,14 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; + + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; @@ -2795,9 +3029,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (!rbio) goto rbio_out; - list_for_each_entry(spage, &sparity->spages, list) - raid56_add_scrub_pages(rbio, spage->page, spage->logical); - scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); return; @@ -2805,6 +3036,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2822,12 +3054,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->refs); + refcount_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->refs)) + if (!refcount_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2879,7 +3111,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->refs, 1); + refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; @@ -3050,9 +3282,9 @@ out: logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); btrfs_release_path(path); return ret < 0 ? ret : 0; @@ -3098,7 +3330,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; - nstripes = div_u64(length, map->stripe_len); + nstripes = div64_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3208,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_blocked_if_needed(fs_info); } @@ -3422,9 +3654,9 @@ skip: out: /* push queued extents */ scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); blk_finish_plug(&plug); btrfs_free_path(path); @@ -3604,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, -1, + ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->key.objectid, cache->key.offset); if (ret > 0) { @@ -3661,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * write requests are really completed when bios_in_flight * changes to 0. */ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); @@ -3679,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_pause_off(fs_info); @@ -4082,32 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, btrfs_put_bbio(bbio); } -static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx, - struct btrfs_device *dev, - int is_dev_replace) -{ - WARN_ON(wr_ctx->wr_curr_bio != NULL); - - mutex_init(&wr_ctx->wr_lock); - wr_ctx->wr_curr_bio = NULL; - if (!is_dev_replace) - return 0; - - WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; - wr_ctx->tgtdev = dev; - atomic_set(&wr_ctx->flush_all_writes, 0); - return 0; -} - -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) -{ - mutex_lock(&wr_ctx->wr_lock); - kfree(wr_ctx->wr_curr_bio); - wr_ctx->wr_curr_bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); -} - static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace) { @@ -4410,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct btrfs_device *dev; int ret; - dev = sctx->wr_ctx.tgtdev; + dev = sctx->wr_tgtdev; if (!dev) return -EIO; if (!dev->bdev) { @@ -4418,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a60d5bfb8a49..e937c10b8287 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } + ret = btrfs_is_name_len_valid(eb, path->slots[0], + (unsigned long)(di + 1), name_len + data_len); + if (!ret) { + ret = -EIO; + goto out; + } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { @@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, buf = tmp; } if (!buf) { - buf = vmalloc(buf_len); + buf = kvmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; @@ -2769,15 +2775,20 @@ out: struct recorded_ref { struct list_head list; - char *dir_path; char *name; struct fs_path *full_path; u64 dir; u64 dir_gen; - int dir_path_len; int name_len; }; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; +} + /* * We need to process new refs before deleted refs, but compare_tree gives us * everything mixed. So we first record all refs and later process them. @@ -2794,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir, ref->dir = dir; ref->dir_gen = dir_gen; - ref->full_path = path; - - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; - ref->dir_path = ref->full_path->start; - if (ref->name == ref->full_path->start) - ref->dir_path_len = 0; - else - ref->dir_path_len = ref->full_path->end - - ref->full_path->start - 1 - ref->name_len; - + set_ref_path(ref, path); list_add_tail(&ref->list, head); return 0; } @@ -3546,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root, struct fs_path *fs_path) { u64 ino = ino2; + bool free_path = false; + int ret = 0; + + if (!fs_path) { + fs_path = fs_path_alloc(); + if (!fs_path) + return -ENOMEM; + free_path = true; + } while (ino > BTRFS_FIRST_FREE_OBJECTID) { - int ret; u64 parent; u64 parent_gen; @@ -3557,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root, if (ret < 0) { if (ret == -ENOENT && ino == ino2) ret = 0; - return ret; + goto out; + } + if (parent == ino1) { + ret = parent_gen == ino1_gen ? 1 : 0; + goto out; } - if (parent == ino1) - return parent_gen == ino1_gen ? 1 : 0; ino = parent; } - return 0; + out: + if (free_path) + fs_path_free(fs_path); + return ret; } static int wait_for_parent_move(struct send_ctx *sctx, @@ -3686,6 +3700,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int is_orphan = 0; u64 last_dir_ino_rm = 0; bool can_rename = true; + bool orphanized_ancestor = false; btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); @@ -3837,9 +3852,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * might contain the pre-orphanization name of * ow_inode, which is no longer valid. */ - fs_path_reset(valid_path); - ret = get_cur_path(sctx, sctx->cur_ino, - sctx->cur_inode_gen, valid_path); + ret = is_ancestor(sctx->parent_root, + ow_inode, ow_gen, + sctx->cur_ino, NULL); + if (ret > 0) { + orphanized_ancestor = true; + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + valid_path); + } if (ret < 0) goto out; } else { @@ -3960,6 +3982,43 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (!ret) { + /* + * If we orphanized any ancestor before, we need + * to recompute the full path for deleted names, + * since any such path was computed before we + * processed any references and orphanized any + * ancestor inode. + */ + if (orphanized_ancestor) { + struct fs_path *new_path; + + /* + * Our reference's name member points to + * its full_path member string, so we + * use here a new path. + */ + new_path = fs_path_alloc(); + if (!new_path) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, + new_path); + if (ret < 0) { + fs_path_free(new_path); + goto out; + } + ret = fs_path_add(new_path, + cur->name, + cur->name_len); + if (ret < 0) { + fs_path_free(new_path); + goto out; + } + fs_path_free(cur->full_path); + set_ref_path(cur, new_path); + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -5184,13 +5243,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { + if (right_type != BTRFS_FILE_EXTENT_REG && + right_type != BTRFS_FILE_EXTENT_INLINE) { ret = 0; goto out; } right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + right_len = btrfs_file_extent_inline_len(eb, slot, ei); + right_len = PAGE_ALIGN(right_len); + } else { + right_len = btrfs_file_extent_num_bytes(eb, ei); + } right_offset = btrfs_file_extent_offset(eb, ei); right_gen = btrfs_file_extent_generation(eb, ei); @@ -5204,6 +5269,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } + /* + * We just wanted to see if when we have an inline extent, what + * follows it is a regular extent (wanted to check the above + * condition for inline extents too). This should normally not + * happen but it's possible for example when we have an inline + * compressed extent representing data with a size matching + * the page size (currently the same as sector size). + */ + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7. */ @@ -6360,22 +6438,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { - sctx->send_buf = vmalloc(sctx->send_max_size); - if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } - sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); + sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); if (!sctx->read_buf) { - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } sctx->pending_dir_moves = RB_ROOT; @@ -6384,25 +6456,19 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); if (!sctx->clone_roots) { - sctx->clone_roots = vzalloc(alloc_size); - if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); if (!clone_sources_tmp) { - clone_sources_tmp = vmalloc(alloc_size); - if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 72a053c9a7f0..74e47794e63f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -601,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_alloc_start: - num = match_strdup(&args[0]); - if (num) { - mutex_lock(&info->chunk_mutex); - info->alloc_start = memparse(num, NULL); - mutex_unlock(&info->chunk_mutex); - kfree(num); - btrfs_info(info, "allocations start at %llu", - info->alloc_start); - } else { - ret = -ENOMEM; - goto out; - } + btrfs_info(info, + "option alloc_start is obsolete, ignored"); break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL @@ -1187,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1232,8 +1222,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); - if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -1716,7 +1704,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; - u64 old_alloc_start = fs_info->alloc_start; int old_thread_pool_size = fs_info->thread_pool_size; unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; @@ -1795,8 +1782,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(*flags & MS_RDONLY)) { + fs_info->num_tolerated_disk_barrier_failures) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; @@ -1856,9 +1842,6 @@ restore: fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; - mutex_lock(&fs_info->chunk_mutex); - fs_info->alloc_start = old_alloc_start; - mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; @@ -1899,18 +1882,15 @@ static inline void btrfs_descending_sort_devices( static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 skip_space; u64 type; u64 avail_space; - u64 used_space; u64 min_stripe_size; int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; - int ret; /* * We aren't under the device list lock, so this is racy-ish, but good @@ -1928,12 +1908,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), - GFP_NOFS); + GFP_KERNEL); if (!devices_info) return -ENOMEM; /* calc min stripe number for data space allocation */ - type = btrfs_get_alloc_profile(root, 1); + type = btrfs_data_alloc_profile(fs_info); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; num_stripes = nr_devices; @@ -1950,8 +1930,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, else min_stripe_size = BTRFS_STRIPE_LEN; - if (fs_info->alloc_start) - mutex_lock(&fs_devices->device_list_mutex); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || @@ -1974,34 +1952,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, */ skip_space = SZ_1M; - /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start && - fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) { - rcu_read_unlock(); - skip_space = max(fs_info->alloc_start, skip_space); - - /* - * btrfs can not use the free space in - * [0, skip_space - 1], we must subtract it from the - * total. In order to implement it, we account the used - * space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, - skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - mutex_unlock(&fs_devices->device_list_mutex); - return ret; - } - - rcu_read_lock(); - - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; - } - /* * we can use the free space in [0, skip_space - 1], subtract * it from the total. @@ -2020,8 +1970,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i++; } rcu_read_unlock(); - if (fs_info->alloc_start) - mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -2058,10 +2006,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, * multiplier to scale the sizes. * * Unused device space usage is based on simulating the chunk allocator - * algorithm that respects the device sizes, order of allocations and the - * 'alloc_start' value, this is a close approximation of the actual use but - * there are other factors that may change the result (like a new metadata - * chunk). + * algorithm that respects the device sizes and order of allocations. This is + * a close approximation of the actual use but there are other factors that may + * change the result (like a new metadata chunk). * * If metadata is exhausted, f_bavail will be 0. */ @@ -2244,7 +2191,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - fs_info->fs_frozen = 1; + set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* * We don't need a barrier here, we'll wait for any transaction that * could be in progress on other threads (and do delayed iputs that @@ -2263,7 +2210,9 @@ static int btrfs_freeze(struct super_block *sb) static int btrfs_unfreeze(struct super_block *sb) { - btrfs_sb(sb)->fs_frozen = 0; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fba8940..c2d5f3580b4c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), BTRFS_ATTR_PTR(clone_alignment), + BTRFS_ATTR_PTR(quota_override), NULL, }; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ea272432c930..b18ab8f327a5 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 133753232a94..d06b1c931d05 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, &inode->i_data); + extent_io_tree_init(&tmp, inode); /* * First go through and create and mark all of our pages dirty, we pin diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 61b807de3e16..f615d59b0489 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(atomic_read(&transaction->use_count) == 0); - if (atomic_dec_and_test(&transaction->use_count)) { + WARN_ON(refcount_read(&transaction->use_count) == 0); + if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); if (transaction->delayed_refs.pending_csums) @@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } - kmem_cache_free(btrfs_transaction_cachep, transaction); + kfree(transaction); } } @@ -207,7 +207,7 @@ loop: spin_unlock(&fs_info->trans_lock); return -EBUSY; } - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); @@ -228,7 +228,7 @@ loop: */ BUG_ON(type == TRANS_JOIN_NOLOCK); - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -238,11 +238,11 @@ loop: * someone started a transaction after we unlocked. Make sure * to redo the checks above */ - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); goto loop; } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); return -EROFS; } @@ -257,7 +257,7 @@ loop: * One for this trans handle, one so it will live on until we * commit the transaction. */ - atomic_set(&cur_trans->use_count, 2); + refcount_set(&cur_trans->use_count, 2); atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = get_seconds(); @@ -294,7 +294,7 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - fs_info->btree_inode->i_mapping); + fs_info->btree_inode); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_event(fs_info->transaction_wait, @@ -572,7 +572,6 @@ again: h->type = type; h->can_flush_pending_bgs = true; - INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); @@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = 0; break; } @@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); break; } } @@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); err = -EIO; } - assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { @@ -1376,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, fs_info); if (ret) goto out; - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret < 0) - goto out; ret = btrfs_qgroup_account_extents(trans, fs_info); if (ret < 0) goto out; @@ -1839,7 +1834,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, /* take transaction reference */ cur_trans = trans->transaction; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); @@ -1928,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static inline void @@ -2015,7 +2010,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&fs_info->trans_lock); - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans); @@ -2035,7 +2030,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state != TRANS_STATE_COMPLETED) { - atomic_inc(&prev_trans->use_count); + refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); @@ -2130,13 +2125,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* Reocrd old roots for later qgroup accounting */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * make sure none of the code above managed to slip in a * delayed item @@ -2179,6 +2167,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_free_log_root_tree(trans, fs_info); /* + * commit_fs_roots() can call btrfs_save_ino_cache(), which generates + * new delayed refs. Must handle them or qgroup can be wrong. + */ + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + + /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ @@ -2223,7 +2222,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) switch_commit_roots(cur_trans, fs_info); - assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); @@ -2306,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * it'll result in deadlock about SB_FREEZE_FS. */ if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && !fs_info->fs_frozen) + current != fs_info->cleaner_kthread && + !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) btrfs_run_delayed_iputs(fs_info); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5dfb5590fff6..c55e44560103 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -18,6 +18,8 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ + +#include <linux/refcount.h> #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" @@ -49,7 +51,7 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; - atomic_t use_count; + refcount_t use_count; atomic_t pending_ordered; unsigned long flags; @@ -125,8 +127,6 @@ struct btrfs_trans_handle { unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; - struct seq_list delayed_ref_elem; - struct list_head qgroup_ref_list; struct list_head new_bgs; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a59674c3e69e..f20ef211a73d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1175,15 +1175,19 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, - u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1198,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, return 0; } -static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) +static int ref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1280,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index, &parent_objectid); + ret = extref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. @@ -1293,8 +1302,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index); } if (ret) goto out; @@ -1841,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) + if (verify_dir_item(fs_info, eb, slot, di)) return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); @@ -2017,7 +2026,7 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) { + if (verify_dir_item(fs_info, eb, slot, di)) { ret = -EIO; goto out; } @@ -2102,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2143,6 +2153,12 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; + ret = verify_dir_item(fs_info, path->nodes[0], + path->slots[0], di); + if (ret) { + ret = -EIO; + goto out; + } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -4196,7 +4212,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ - atomic_inc(&em->refs); + refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); num++; @@ -4546,6 +4562,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } + ret = btrfs_is_name_len_valid(eb, slot, name_ptr, + this_name_len); + if (!ret) { + ret = -EIO; + goto out; + } if (this_name_len > name_len) { char *new_name; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab8a66d852f9..5eb7217738ed 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, + enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, + int mirror_num, int need_raid_map); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -237,6 +242,17 @@ static struct btrfs_device *__alloc_device(void) if (!dev) return ERR_PTR(-ENOMEM); + /* + * Preallocate a bio that's always going to be used for flushing device + * barriers and matches the device lifespan + */ + dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + if (!dev->flush_bio) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + bio_get(dev->flush_bio); + INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->resized_list); @@ -833,6 +849,7 @@ static void __free_device(struct work_struct *work) device = container_of(work, struct btrfs_device, rcu_work); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1008,14 +1025,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1349,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int ret; int slot; struct extent_buffer *l; - u64 min_search_start; /* * We don't want to overwrite the superblock on the drive nor any area * used by the boot loader (grub for example), so we make sure to start * at an offset of at least 1MB. */ - min_search_start = max(fs_info->alloc_start, 1024ull * 1024); - search_start = max(search_start, min_search_start); + search_start = max_t(u64, search_start, SZ_1M); path = btrfs_alloc_path(); if (!path) @@ -2383,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); + device->total_bytes = round_down(i_size_read(bdev->bd_inode), + fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; @@ -2413,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->fs_devices->total_devices++; fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; tmp = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, - tmp + device->total_bytes); + round_down(tmp + device->total_bytes, fs_info->sectorsize)); tmp = btrfs_super_num_devices(fs_info->super_copy); btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); @@ -2570,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } - name = rcu_string_strdup(device_path, GFP_NOFS); + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); ret = -ENOMEM; @@ -2685,6 +2698,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; + new_size = round_down(new_size, fs_info->sectorsize); + mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = new_size - device->total_bytes; @@ -2697,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, old_total + diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; btrfs_device_set_total_bytes(device, new_size); @@ -2795,10 +2811,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, return ret; } +static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, "unable to find logical %llu length %llu", + logical, length); + return ERR_PTR(-EINVAL); + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, + "found a bad mapping, wanted %llu-%llu, found %llu-%llu", + logical, length, em->start, em->start + em->len); + free_extent_map(em); + return ERR_PTR(-EINVAL); + } + + /* callers are responsible for dropping em's ref. */ + return em; +} + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; @@ -2806,23 +2850,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em_tree = &fs_info->mapping_tree.map_tree; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em || em->start > chunk_offset || - em->start + em->len < chunk_offset) { + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - if (em) - free_extent_map(em); - return -EINVAL; + return PTR_ERR(em); } map = em->map_lookup; mutex_lock(&fs_info->chunk_mutex); @@ -2850,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += dev_extent_len; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -3736,7 +3770,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } /* Non-zero return value signifies invalidity */ @@ -3755,6 +3789,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; @@ -3851,11 +3886,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < - btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < + btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - bctl->meta.target, bctl->data.target); + meta_target, data_target); } if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { @@ -3910,7 +3950,7 @@ out: __cancel_balance(fs_info); else { kfree(bctl); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } return ret; } @@ -4000,7 +4040,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -4363,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); - u64 diff = old_size - new_size; + u64 diff; + + new_size = round_down(new_size, fs_info->sectorsize); + diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -4379,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space -= diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_sub(diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); @@ -4492,7 +4533,8 @@ again: &fs_info->fs_devices->resized_devices); WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); /* Now btrfs_update_device() will change the on-disk size. */ @@ -4505,9 +4547,7 @@ done: btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -4785,7 +4825,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - stripe_size = div_u64(stripe_size, raid_stripe_len); + stripe_size = div64_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4833,7 +4873,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = add_extent_mapping(em_tree, em, 0); if (!ret) { list_add_tail(&em->list, &trans->transaction->pending_chunks); - atomic_inc(&em->refs); + refcount_inc(&em->refs); } write_unlock(&em_tree->lock); if (ret) { @@ -4852,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); } - spin_lock(&info->free_chunk_lock); - info->free_chunk_space -= (stripe_size * map->num_stripes); - spin_unlock(&info->free_chunk_lock); + atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); @@ -4888,7 +4926,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; size_t item_size; @@ -4897,24 +4934,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %Lu len %Lu", - chunk_offset, chunk_size); - return -EINVAL; - } - - if (em->start != chunk_offset || em->len != chunk_size) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu", - chunk_offset, chunk_size, em->start, em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); @@ -5015,20 +5037,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_root *extent_root = fs_info->extent_root; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; int ret; chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(extent_root, 0); + alloc_profile = btrfs_metadata_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); if (ret) return ret; sys_chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); + alloc_profile = btrfs_system_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); return ret; } @@ -5055,15 +5076,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int readonly = 0; int miss_ndevs = 0; int i; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) return 1; map = em->map_lookup; @@ -5117,34 +5135,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - - /* - * We could return errors for these cases, but that could get ugly and - * we'd probably do the same thing which is just not do anything else - * and exit, so return 1 so the callers don't try to use other copies. - */ - if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, - logical+len); - return 1; - } - - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu", - logical, logical+len, em->start, - em->start + em->len); - free_extent_map(em); + em = get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + /* + * We could return errors for these cases, but that could get + * ugly and we'd probably do the same thing which is just not do + * anything else and exit, so return 1 so the callers don't try + * to use other copies. + */ return 1; - } map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) @@ -5160,7 +5163,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) free_extent_map(em); btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && + fs_info->dev_replace.tgtdev) ret++; btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -5173,15 +5177,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; unsigned long len = fs_info->sectorsize; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); @@ -5189,20 +5189,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num) { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; @@ -5295,25 +5291,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); - atomic_set(&bbio->refs, 1); + refcount_set(&bbio->refs, 1); return bbio; } void btrfs_get_bbio(struct btrfs_bio *bbio) { - WARN_ON(!atomic_read(&bbio->refs)); - atomic_inc(&bbio->refs); + WARN_ON(!refcount_read(&bbio->refs)); + refcount_inc(&bbio->refs); } void btrfs_put_bbio(struct btrfs_bio *bbio) { if (!bbio) return; - if (atomic_dec_and_test(&bbio->refs)) + if (refcount_dec_and_test(&bbio->refs)) kfree(bbio); } +/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ +/* + * Please note that, discard won't be sent to target device of device + * replace. + */ +static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + struct btrfs_bio **bbio_ret) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_bio *bbio; + u64 offset; + u64 stripe_nr; + u64 stripe_nr_end; + u64 stripe_end_offset; + u64 stripe_cnt; + u64 stripe_len; + u64 stripe_offset; + u64 num_stripes; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret = 0; + int i; + + /* discard always return a bbio */ + ASSERT(bbio_ret); + + em = get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + offset = logical - em->start; + length = min_t(u64, em->len - offset, length); + + stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = div64_u64(offset, stripe_len); + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_nr * stripe_len; + + stripe_nr_end = round_up(offset + length, map->stripe_len); + stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_cnt = stripe_nr_end - stripe_nr; + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + num_stripes = 1; + stripe_index = 0; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index *= sub_stripes; + stripes_per_dev = div_u64_rem(stripe_cnt, factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = map->num_stripes; + } else { + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + } + + bbio = alloc_btrfs_bio(num_stripes, 0); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else { + bbio->stripes[i].length = length; + } + + stripe_index++; + if (stripe_index == map->num_stripes) { + stripe_index = 0; + stripe_nr++; + } + } + + *bbio_ret = bbio; + bbio->map_type = map->type; + bbio->num_stripes = num_stripes; +out: + free_extent_map(em); + return ret; +} + +/* + * In dev-replace case, for repair case (that's the only case where the mirror + * is selected explicitly when calling btrfs_map_block), blocks left of the + * left cursor can also be read from the target drive. + * + * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the + * array of stripes. + * For READ, it also needs to be supported using the same mirror number. + * + * If the requested block is not left of the left cursor, EIO is returned. This + * can happen because btrfs_num_copies() returns one more in the dev-replace + * case. + */ +static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + u64 srcdev_devid, int *mirror_num, + u64 *physical) +{ + struct btrfs_bio *bbio = NULL; + int num_stripes; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + int i; + int ret = 0; + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &length, &bbio, 0, 0); + if (ret) { + ASSERT(bbio == NULL); + return ret; + } + + num_stripes = bbio->num_stripes; + if (*mirror_num > num_stripes) { + /* + * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, + * that means that the requested area is not left of the left + * cursor + */ + btrfs_put_bbio(bbio); + return -EIO; + } + + /* + * process the rest of the function using the mirror_num of the source + * drive. Therefore look it up first. At the end, patch the device + * pointer to the one of the target drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add the + * mirror with the lowest physical address + */ + if (found && + physical_of_found <= bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + + btrfs_put_bbio(bbio); + + ASSERT(found); + if (!found) + return -EIO; + + *mirror_num = index_srcdev + 1; + *physical = physical_of_found; + return ret; +} + +static void handle_ops_on_dev_replace(enum btrfs_map_op op, + struct btrfs_bio **bbio_ret, + struct btrfs_dev_replace *dev_replace, + int *num_stripes_ret, int *max_errors_ret) +{ + struct btrfs_bio *bbio = *bbio_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; + int tgtdev_indexes = 0; + int num_stripes = *num_stripes_ret; + int max_errors = *max_errors_ret; + int i; + + if (op == BTRFS_MAP_WRITE) { + int index_where_to_add; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk to + * the new disk takes place at run time while the filesystem is + * mounted writable, the regular write operations to the old + * disk have to be duplicated to go to the new disk as well. + * + * Note that device->missing is handled by the caller, and that + * the write to the old disk is already set up in the stripes + * array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; + index_where_to_add++; + max_errors++; + tgtdev_indexes++; + } + } + num_stripes = index_where_to_add; + } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can also + * be used to read data in case it is needed to repair a corrupt + * block elsewhere. This is possible if the requested area is + * left of the left cursor. In this area, the target drive is a + * full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it simple, + * only add the mirror with the lowest physical + * address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + + tgtdev_indexes++; + num_stripes++; + } + } + + *num_stripes_ret = num_stripes; + *max_errors_ret = max_errors; + bbio->num_tgtdevs = tgtdev_indexes; + *bbio_ret = bbio; +} + +static bool need_full_stripe(enum btrfs_map_op op) +{ + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5322,14 +5646,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; - u64 stripe_end_offset; u64 stripe_nr; - u64 stripe_nr_orig; - u64 stripe_nr_end; u64 stripe_len; u32 stripe_index; int i; @@ -5345,23 +5664,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, *length); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu len %llu", - logical, *length); - return -EINVAL; - } + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + *length, bbio_ret); - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu, found %Lu-%Lu", - logical, em->start, em->start + em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; offset = logical - em->start; @@ -5400,14 +5709,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, raid56_full_stripe_start *= full_stripe_len; } - if (op == BTRFS_MAP_DISCARD) { - /* we don't discard raid56 yet */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = -EOPNOTSUPP; - goto out; - } - *length = min_t(u64, em->len - offset, *length); - } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { u64 max_len; /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single @@ -5438,105 +5740,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { - /* - * in dev-replace case, for repair case (that's the only - * case where the mirror is selected explicitly when - * calling btrfs_map_block), blocks left of the left cursor - * can also be read from the target drive. - * For REQ_GET_READ_MIRRORS, the target drive is added as - * the last one to the array of stripes. For READ, it also - * needs to be supported using the same mirror number. - * If the requested block is not left of the left cursor, - * EIO is returned. This can happen because btrfs_num_copies() - * returns one more in the dev-replace case. - */ - u64 tmp_length = *length; - struct btrfs_bio *tmp_bbio = NULL; - int tmp_num_stripes; - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, 0); - if (ret) { - WARN_ON(tmp_bbio != NULL); - goto out; - } - - tmp_num_stripes = tmp_bbio->num_stripes; - if (mirror_num > tmp_num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this - * mirror, that means that the requested area - * is not left of the left cursor - */ - ret = -EIO; - btrfs_put_bbio(tmp_bbio); - goto out; - } - - /* - * process the rest of the function using the mirror_num - * of the source drive. Therefore look it up first. - * At the end, patch the device pointer to the one of the - * target drive. - */ - for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add - * the mirror with the lowest physical address - */ - if (found && - physical_of_found <= tmp_bbio->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = tmp_bbio->stripes[i].physical; - } - - btrfs_put_bbio(tmp_bbio); - - if (!found) { - WARN_ON(1); - ret = -EIO; + !need_full_stripe(op) && dev_replace->tgtdev != NULL) { + ret = get_extra_mirror_from_replace(fs_info, logical, *length, + dev_replace->srcdev->devid, + &mirror_num, + &physical_to_patch_in_first_stripe); + if (ret) goto out; - } - - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; + else + patch_the_first_stripe_for_dev_replace = 1; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } num_stripes = 1; stripe_index = 0; - stripe_nr_orig = stripe_nr; - stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); - stripe_end_offset = stripe_nr_end * map->stripe_len - - (offset + *length); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->num_stripes, - stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) + if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5549,8 +5774,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) { + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5566,10 +5790,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->sub_stripes * - (stripe_nr_end - stripe_nr_orig), - map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -5587,7 +5807,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div_u64(raid56_full_stripe_start, + stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ @@ -5612,8 +5832,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) + if ((op != BTRFS_MAP_WRITE && + op != BTRFS_MAP_GET_READ_MIRRORS) && + mirror_num <= 1) mirror_num = 1; } } else { @@ -5635,8 +5856,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { + if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++; @@ -5648,14 +5869,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && - ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || - mirror_num > 1)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5679,173 +5898,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, RAID6_Q_STRIPE; } - if (op == BTRFS_MAP_DISCARD) { - u32 factor = 0; - u32 sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - u32 last_stripe = 0; - - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - /* - * Special for the first stripe and - * the last stripe: - * - * |-------|...|-------| - * |----------| - * off end_off - */ - if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; - - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; - - if (i == sub_stripes - 1) - stripe_offset = 0; - } else - bbio->stripes[i].length = *length; - - stripe_index++; - if (stripe_index == map->num_stripes) { - /* This could only happen for RAID0/10 */ - stripe_index = 0; - stripe_nr++; - } - } - } else { - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; - } + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; } - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); - tgtdev_indexes = 0; - if (dev_replace_is_ongoing && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && - dev_replace->tgtdev != NULL) { - int index_where_to_add; - u64 srcdev_devid = dev_replace->srcdev->devid; - - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk - * to the new disk takes place at run time while the - * filesystem is mounted writable, the regular write - * operations to the old disk have to be duplicated to go - * to the new disk as well. - * Note that device->missing is handled by the caller, and - * that the write to the old disk is already set up in the - * stripes array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; - - new->physical = old->physical; - new->length = old->length; - new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && - op == BTRFS_MAP_GET_READ_MIRRORS && - dev_replace->tgtdev != NULL) { - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - /* - * During the dev-replace procedure, the target drive can - * also be used to read data in case it is needed to repair - * a corrupt block elsewhere. This is possible if the - * requested area is left of the left cursor. In this area, - * the target drive is a full copy of the source drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bbio->stripes[i].physical; - } - } - if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; - - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; - - tgtdev_indexes++; - num_stripes++; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + need_full_stripe(op)) { + handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, + &max_errors); } *bbio_ret = bbio; @@ -5853,7 +5926,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; - bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -5886,19 +5958,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map) + struct btrfs_bio **bbio_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, - mirror_num, need_raid_map); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; struct extent_map *em; struct map_lookup *map; u64 *buf; @@ -5908,24 +5976,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 rmap_len; int i, j, nr = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_start, 1); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_err(fs_info, "couldn't find em for chunk %Lu", - chunk_start); + em = get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) return -EIO; - } - if (em->start != chunk_start) { - btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu", - em->start, chunk_start); - free_extent_map(em); - return -EIO; - } map = em->map_lookup; - length = em->len; rmap_len = map->stripe_len; @@ -5949,7 +6004,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, continue; stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -5994,9 +6049,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -6034,13 +6090,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6151,7 +6207,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } @@ -6218,10 +6274,9 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, continue; } - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); /* -ENOMEM */ - } else + if (dev_nr < total_devs - 1) + bio = btrfs_bio_clone(first_bio); + else bio = first_bio; submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, @@ -6636,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes - device->bytes_used, + &fs_info->free_chunk_space); } ret = 0; return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59be81206dd7..6f45fd60d15a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,6 +74,8 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; + int last_flush_error; + int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -123,7 +125,6 @@ struct btrfs_device { struct list_head resized_list; /* for sending down flush barriers */ - int nobarriers; struct bio *flush_bio; struct completion flush_wait; @@ -280,6 +281,11 @@ struct btrfs_io_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; btrfs_io_bio_end_io_t *end_io; + struct bvec_iter iter; + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_io_bio but relies on bio being last. + */ struct bio bio; }; @@ -298,7 +304,7 @@ struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ @@ -400,8 +406,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map); + struct btrfs_bio **bbio_ret); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -475,7 +480,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct btrfs_mapping_tree *map_tree, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b3cbf80c5acf..2c7e53f9ff1b 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, di)) { + if (verify_dir_item(fs_info, leaf, slot, di)) { ret = -EIO; goto err; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 135b10823c6d..c248f9286366 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -24,12 +24,13 @@ #include <linux/slab.h> #include <linux/zlib.h> #include <linux/zutil.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> +#include <linux/refcount.h> #include "compression.h" struct workspace { @@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->strm.workspace); + kvfree(workspace->strm.workspace); kfree(workspace->buf); kfree(workspace); } @@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void) struct workspace *workspace; int workspacesize; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = vmalloc(workspacesize); - workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); + workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -211,10 +212,7 @@ out: return ret; } -static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -222,8 +220,12 @@ static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, char *data_in; size_t total_out = 0; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270da..ea0e05ec2916 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,8 +49,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -179,7 +178,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost sync page write"); - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } unlock_buffer(bh); @@ -353,8 +352,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); - mapping_set_error(page->mapping, -EIO); - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); } @@ -482,8 +480,6 @@ static void __remove_assoc_queue(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); - if (buffer_write_io_error(bh)) - set_bit(AS_EIO, &bh->b_assoc_map->flags); bh->b_assoc_map = NULL; } @@ -1182,6 +1178,17 @@ void mark_buffer_dirty(struct buffer_head *bh) } EXPORT_SYMBOL(mark_buffer_dirty); +void mark_buffer_write_io_error(struct buffer_head *bh) +{ + set_buffer_write_io_error(bh); + /* FIXME: do we need to set this in both places? */ + if (bh->b_page && bh->b_page->mapping) + mapping_set_error(bh->b_page->mapping, -EIO); + if (bh->b_assoc_map) + mapping_set_error(bh->b_assoc_map, -EIO); +} +EXPORT_SYMBOL(mark_buffer_write_io_error); + /* * Decrement a buffer_head's reference count. If all buffers against a page * have zero reference count, are clean and unlocked, and if the page is clean @@ -1830,7 +1837,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1884,7 +1892,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -2379,8 +2388,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) goto out; err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, - &page, &fsdata); + AOP_FLAG_CONT_EXPAND, &page, &fsdata); if (err) goto out; @@ -2415,9 +2423,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -2449,9 +2456,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -3025,11 +3031,11 @@ EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { - struct buffer_head tmp; struct inode *inode = mapping->host; - tmp.b_state = 0; - tmp.b_blocknr = 0; - tmp.b_size = i_blocksize(inode); + struct buffer_head tmp = { + .b_size = i_blocksize(inode), + }; + get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } @@ -3042,7 +3048,7 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } @@ -3095,7 +3101,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc) + enum rw_hint write_hint, struct writeback_control *wbc) { struct bio *bio; @@ -3124,13 +3130,13 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bio_eod(op, bio); @@ -3145,14 +3151,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, return 0; } -int _submit_bh(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags) -{ - return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL); -} -EXPORT_SYMBOL_GPL(_submit_bh); - -int submit_bh(int op, int op_flags, struct buffer_head *bh) +int submit_bh(int op, int op_flags, struct buffer_head *bh) { return submit_bh_wbc(op, op_flags, bh, 0, NULL); } @@ -3291,8 +3290,6 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = head; do { - if (buffer_write_io_error(bh) && page->mapping) - mapping_set_error(page->mapping, -EIO); if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; @@ -3504,6 +3501,130 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * + * Returns the offset within the file on success, and -ENOENT otherwise. + */ +static loff_t +page_seek_hole_data(struct page *page, loff_t lastoff, int whence) +{ + loff_t offset = page_offset(page); + struct buffer_head *bh, *head; + bool seek_data = whence == SEEK_DATA; + + if (lastoff < offset) + lastoff = offset; + + bh = head = page_buffers(page); + do { + offset += bh->b_size; + if (lastoff >= offset) + continue; + + /* + * Unwritten extents that have data in the page cache covering + * them can be identified by the BH_Unwritten state flag. + * Pages with multiple buffers might have a mix of holes, data + * and unwritten extents - any buffer with valid data in it + * should have BH_Uptodate flag set on it. + */ + + if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data) + return lastoff; + + lastoff = offset; + } while ((bh = bh->b_this_page) != head); + return -ENOENT; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: unwritten and uptodate buffer heads count as data; + * everything else counts as a hole. + * + * Returns the resulting offset on successs, and -ENOENT otherwise. + */ +loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec, 0); + + do { + unsigned want, nr_pages, i; + + want = min_t(unsigned, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + * + * If current page offset is beyond where we've ended, + * we've found a hole. + */ + if (whence == SEEK_HOLE && + lastoff < page_offset(page)) + goto check_range; + + /* Searching done if the page index is out of range. */ + if (page->index >= end) + goto not_found; + + lock_page(page); + if (likely(page->mapping == inode->i_mapping) && + page_has_buffers(page)) { + lastoff = page_seek_hole_data(page, lastoff, whence); + if (lastoff >= 0) { + unlock_page(page); + goto check_range; + } + } + unlock_page(page); + lastoff = page_offset(page) + PAGE_SIZE; + } + + /* Searching done if fewer pages returned than wanted. */ + if (nr_pages < want) + break; + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + void __init buffer_init(void) { unsigned long nrpages; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9bf90bcc56ac..bb3a02ca9da4 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -18,7 +18,7 @@ #include <linux/fscache-cache.h> #include <linux/timer.h> -#include <linux/wait.h> +#include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/workqueue.h> #include <linux/security.h> @@ -97,7 +97,7 @@ struct cachefiles_cache { * backing file read tracking */ struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ + wait_queue_entry_t monitor; /* link into monitored waitqueue */ struct page *back_page; /* backing file page we're waiting for */ struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 41df8a27d7eb..3978b324cbca 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -204,7 +204,7 @@ wait_for_old_object: wait_queue_head_t *wq; signed long timeout = 60 * HZ; - wait_queue_t wait; + wait_queue_entry_t wait; bool requeue; /* if the object we're waiting for is queued for processing, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index afbdc418966d..18d7aa61ef0f 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -21,7 +21,7 @@ * - we use this to detect read completion of backing pages * - the caller holds the waitqueue lock */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, +static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, int sync, void *_key) { struct cachefiles_one_read *monitor = @@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, } /* remove from the waitqueue */ - list_del(&wait->task_list); + list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ ASSERT(monitor->op); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 987044bca1c2..59cb307b15fb 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -131,6 +131,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) } if (new_mode != old_mode) { + newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; ret = __ceph_setattr(inode, &newattrs); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9ecb2fd348cb..1e71e6ca5ddf 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req) bool remove_page; dout("writepages_finish %p rc %d\n", inode, rc); - if (rc < 0) + if (rc < 0) { mapping_set_error(mapping, rc); + ceph_set_error_write(ci); + } else { + ceph_clear_error_write(ci); + } /* * We lost the cache cap, need to truncate the page before @@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - if (rc < 0) - SetPageError(page); - ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 68c78be19d5b..a3ebb632294e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg) void *p; size_t extra_len; struct timespec zerotime = {0}; + struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" @@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg) ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); - /* osd_epoch_barrier (version 5) */ - ceph_encode_32(&p, 0); + /* + * osd_epoch_barrier (version 5) + * The epoch_barrier is protected osdc->lock, so READ_ONCE here in + * case it was recently changed + */ + ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); @@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); - atomic_inc(&capsnap->nref); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); dout("__flush_snaps %p capsnap %p tid %llu %s\n", @@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); - atomic_inc(&capsnap->nref); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, @@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 5) { + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + u32 epoch_barrier; + + ceph_decode_32_safe(&p, end, epoch_barrier, bad); + ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); + } + if (le16_to_cpu(msg->hdr.version) >= 8) { u64 flush_tid; u32 caller_uid, caller_gid; - u32 osd_epoch_barrier; u32 pool_ns_len; - /* version >= 5 */ - ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ ceph_decode_64_safe(&p, end, flush_tid, bad); /* version >= 7 */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3ef11bc8d728..4e2d112c982f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); - seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); - seq_printf(s, "session_timeout %d\n", - fsc->mdsc->mdsmap->m_session_timeout); - seq_printf(s, "session_autoclose %d\n", - fsc->mdsc->mdsmap->m_session_autoclose); - for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { - struct ceph_entity_addr *addr = - &fsc->mdsc->mdsmap->m_info[i].addr; - int state = fsc->mdsc->mdsmap->m_info[i].state; - + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->m_num_mds; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e9ad501addf..e071d23f6148 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_client *mdsc = fsc->mdsc; int i; int err; - u32 ftype; + unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); @@ -341,7 +341,6 @@ more: /* do we have the correct frag content buffered? */ if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; - unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -352,8 +351,11 @@ more: } if (is_hash_order(ctx->pos)) { - frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), - NULL, NULL); + /* fragtree isn't always accurate. choose frag + * based on previous reply when possible. */ + if (frag == (unsigned)-1) + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); } else { frag = fpos_frag(ctx->pos); } @@ -378,7 +380,11 @@ more: ceph_mdsc_put_request(req); return -ENOMEM; } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); } + req->r_dir_release_cnt = fi->dir_release_count; req->r_dir_ordered_cnt = fi->dir_ordered_count; req->r_readdir_cache_idx = fi->readdir_cache_idx; @@ -476,6 +482,7 @@ more: struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; + u32 ftype; BUG_ON(rde->offset < ctx->pos); @@ -498,15 +505,17 @@ more: ctx->pos++; } + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + if (fi->next_offset > 2) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + frag = fi->frag; goto more; } /* more frags? */ if (!ceph_frag_is_rightmost(fi->frag)) { - unsigned frag = ceph_frag_next(fi->frag); + frag = ceph_frag_next(fi->frag); if (is_hash_order(ctx->pos)) { loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), fi->next_offset, true); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e8f11fa565c5..7df550c13d7f 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -91,6 +91,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } } return d_obtain_alias(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 26cc95421cca..29308a80d66f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -13,6 +13,38 @@ #include "mds_client.h" #include "cache.h" +static __le32 ceph_flags_sys2wire(u32 flags) +{ + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + dout("unused open flags: %x", flags); + + return cpu_to_le32(wire_flags); +} + /* * Ceph file operations * @@ -74,12 +106,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, align = (unsigned long)(it->iov->iov_base + it->iov_offset) & (PAGE_SIZE - 1); npages = calc_pages_for(align, nbytes); - pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) { - pages = vmalloc(sizeof(*pages) * npages); - if (!pages) - return ERR_PTR(-ENOMEM); - } + pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); for (idx = 0; idx < npages; ) { size_t start; @@ -123,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.flags = ceph_flags_sys2wire(flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; @@ -192,7 +221,7 @@ int ceph_renew_caps(struct inode *inode) spin_lock(&ci->i_ceph_lock); wanted = __ceph_caps_file_wanted(ci); if (__ceph_is_any_real_caps(ci) && - (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { int issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", @@ -781,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; + req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: @@ -1088,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, out: ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); - } - } else + if (ret != 0) { + ceph_set_error_write(ci); break; + } + + ceph_clear_error_write(ci); + pos += len; + written += len; + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } if (ret != -EOLDSNAPC && written > 0) { @@ -1306,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: + /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; @@ -1327,7 +1361,8 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || + (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; inode_unlock(inode); @@ -1636,8 +1671,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) + if (!(mode & FALLOC_FL_KEEP_SIZE)) { endoff = offset + length; + ret = inode_newsize_ok(inode, endoff); + if (ret) + goto unlock; + } if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d3119fe3ab45..4de6cdddf059 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); - if (rinfo->hash_order && req->r_path2) { - last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - req->r_path2, strlen(req->r_path2)); - last_hash = ceph_frag_value(last_hash); + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + } } if (rinfo->dir_dir && @@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && req->r_path2)) { + !(rinfo->hash_order && last_hash)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); @@ -2015,7 +2022,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); - inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || @@ -2037,7 +2043,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); - inode->i_ctime = attr->ia_ctime; if (only) { /* * if kernel wants to dirty ctime but nothing else, @@ -2060,7 +2065,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = current_time(inode); + inode->i_ctime = attr->ia_ctime; } release &= issued; @@ -2078,6 +2083,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + req->r_stamp = attr->ia_ctime; err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c681762d76e6..0c05df44cc6c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; @@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s) static struct ceph_mds_session *get_session(struct ceph_mds_session *s) { - if (atomic_inc_not_zero(&s->s_ref)) { + if (refcount_inc_not_zero(&s->s_ref)) { dout("mdsc get_session %p %d -> %d\n", s, - atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); return s; } else { dout("mdsc get_session %p 0 -- FAIL", s); @@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s) { dout("mdsc put_session %p %d -> %d\n", s, - atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); - if (atomic_dec_and_test(&s->s_ref)) { + refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); + if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); kfree(s); @@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, return NULL; session = mdsc->sessions[mds]; dout("lookup_mds_session %p %d\n", session, - atomic_read(&session->s_ref)); + refcount_read(&session->s_ref)); get_session(session); return session; } @@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; s->s_trim_caps = 0; - atomic_set(&s->s_ref, 1); + refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; @@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, } mdsc->sessions[mds] = s; atomic_inc(&mdsc->num_sessions); - atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); @@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return; mi = &mdsc->mdsmap->m_info[mds]; @@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; struct ceph_cap *cap; LIST_HEAD(tmp_list); int num_cap_releases; + __le32 barrier, *cap_barrier; + + down_read(&osdc->lock); + barrier = cpu_to_le32(osdc->epoch_barrier); + up_read(&osdc->lock); spin_lock(&session->s_cap_lock); again: @@ -1571,7 +1578,11 @@ again: head = msg->front.iov_base; head->num = cpu_to_le32(0); msg->front.iov_len = sizeof(*head); + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); } + cap = list_first_entry(&tmp_list, struct ceph_cap, session_caps); list_del(&cap->session_caps); @@ -1589,6 +1600,11 @@ again: ceph_put_cap(mdsc, cap); if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); @@ -1604,6 +1620,11 @@ again: spin_unlock(&session->s_cap_lock); if (msg) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); @@ -1684,7 +1705,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - req->r_stamp = current_fs_time(mdsc->fsc->sb); + req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran); req->r_op = op; req->r_direct_mode = mode; @@ -1991,7 +2012,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - atomic_inc(&pagelist->refcnt); + refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { @@ -2638,8 +2659,10 @@ static void handle_session(struct ceph_mds_session *session, seq = le64_to_cpu(h->seq); mutex_lock(&mdsc->mutex); - if (op == CEPH_SESSION_CLOSE) + if (op == CEPH_SESSION_CLOSE) { + get_session(session); __unregister_session(mdsc, session); + } /* FIXME: this ttl calculation is generous */ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; mutex_unlock(&mdsc->mutex); @@ -2728,6 +2751,8 @@ static void handle_session(struct ceph_mds_session *session, kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } + if (op == CEPH_SESSION_CLOSE) + ceph_put_mds_session(session); return; bad: @@ -3107,7 +3132,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { if (mdsc->sessions[i] == NULL) continue; s = mdsc->sessions[i]; @@ -3121,15 +3146,33 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_max_mds || + if (i >= newmap->m_num_mds || memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { /* the session never opened, just close it * out now */ + get_session(s); + __unregister_session(mdsc, s); __wake_requests(mdsc, &s->s_waiting); + ceph_put_mds_session(s); + } else if (i >= newmap->m_num_mds) { + /* force close session for stopped mds */ + get_session(s); __unregister_session(mdsc, s); + __wake_requests(mdsc, &s->s_waiting); + kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + cleanup_session_requests(mdsc, s); + remove_session_caps(s); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); } else { /* just close it */ mutex_unlock(&mdsc->mutex); @@ -3167,7 +3210,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; @@ -3881,7 +3924,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con) struct ceph_mds_session *s = con->private; if (get_session(s)) { - dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref)); return con; } dout("mdsc con_get %p FAIL\n", s); @@ -3892,7 +3935,7 @@ static void con_put(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); + dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1); ceph_put_mds_session(s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ac0475a2daa7..db57ae98ed34 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -7,6 +7,7 @@ #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/spinlock.h> +#include <linux/refcount.h> #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> @@ -82,9 +83,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete; bool dir_end; + bool dir_complete; bool hash_order; + bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; @@ -104,10 +106,13 @@ struct ceph_mds_reply_info_parsed { /* * cap releases are batched and sent to the MDS en masse. + * + * Account for per-message overhead of mds_cap_release header + * and __le32 for osd epoch barrier trailing field. */ -#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \ +#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ sizeof(struct ceph_mds_cap_release)) / \ - sizeof(struct ceph_mds_cap_item)) + sizeof(struct ceph_mds_cap_item)) /* @@ -156,7 +161,7 @@ struct ceph_mds_session { unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - atomic_t s_ref; + refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -373,7 +378,7 @@ __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); static inline struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s) { - atomic_inc(&s->s_ref); + refcount_inc(&s->s_ref); return s; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 5454e2327a5f..1a748cf88535 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) int i; /* special case for one mds */ - if (1 == m->m_max_mds && m->m_info[0].state > 0) + if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) if (m->m_info[i].state > 0) n++; if (n == 0) @@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); + m->m_num_mds = m->m_max_mds; - m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) goto nomem; @@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds < 0 || mds >= m->m_max_mds || state <= 0) + if (mds < 0 || state <= 0) continue; + if (mds >= m->m_num_mds) { + int new_num = max(mds + 1, m->m_num_mds * 2); + void *new_m_info = krealloc(m->m_info, + new_num * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + m->m_num_mds = new_num; + } + info = &m->m_info[mds]; info->global_id = global_id; info->state = state; @@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } + if (m->m_num_mds > m->m_max_mds) { + /* find max up mds */ + for (i = m->m_num_mds; i >= m->m_max_mds; i--) { + if (i == 0 || m->m_info[i-1].state > 0) + break; + } + m->m_num_mds = i; + } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_max_mds) { + if (mds >= 0 && mds < m->m_num_mds) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; + + if (n > m->m_num_mds) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->m_num_mds = n; } /* inc */ @@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_num_laggy > 0) return false; - for (i = 0; i < m->m_max_mds; i++) { + for (i = 0; i < m->m_num_mds; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) nr_active++; } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 8f8b41c2ef0f..dab5d6732345 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -519,7 +519,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->need_flush ? "" : "no_flush"); ihold(inode); - atomic_set(&capsnap->nref, 1); + refcount_set(&capsnap->nref, 1); INIT_LIST_HEAD(&capsnap->ci_item); capsnap->follows = old_snapc->seq; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a8c81b2052ca..8d7918ce694a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const u64 supported_features = - CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; - const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc, supported_features, - required_features); + fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 176186b12457..a973acd8beaf 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -14,6 +14,7 @@ #include <linux/writeback.h> #include <linux/slab.h> #include <linux/posix_acl.h> +#include <linux/refcount.h> #include <linux/ceph/libceph.h> @@ -160,7 +161,7 @@ struct ceph_cap_flush { * data before flushing the snapped state (tracked here) back to the MDS. */ struct ceph_cap_snap { - atomic_t nref; + refcount_t nref; struct list_head ci_item; struct ceph_cap_flush cap_flush; @@ -189,7 +190,7 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) { + if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); @@ -471,6 +472,32 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ + +/* + * We set the ERROR_WRITE bit when we start seeing write errors on an inode + * and then clear it when they start succeeding. Note that we do a lockless + * check first, and only take the lock if it looks like it needs to be changed. + * The write submission code just takes this as a hint, so we're not too + * worried if a few slip through in either direction. + */ +static inline void ceph_set_error_write(struct ceph_inode_info *ci) +{ + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} + +static inline void ceph_clear_error_write(struct ceph_inode_info *ci) +{ + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index febc28f9e2c2..75267cdd5dfd 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (update_xattr) { int err = 0; + if (xattr && (flags & XATTR_CREATE)) err = -EEXIST; else if (!xattr && (flags & XATTR_REPLACE)) @@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci, if (err) { kfree(name); kfree(val); + kfree(*newxattr); return err; } if (update_xattr < 0) { if (xattr) __remove_xattr(ci, xattr); kfree(name); + kfree(*newxattr); return 0; } } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 034f00f21390..afeefe79c25e 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -146,6 +146,15 @@ config CIFS_DEBUG2 option can be turned off unless you are debugging cifs problems. If unsure, say N. +config CIFS_DEBUG_DUMP_KEYS + bool "Dump encryption keys for offline decryption (Unsafe)" + depends on CIFS_DEBUG && CIFS_SMB2 + help + Enabling this will dump the encryption and decryption keys + used to communicate on an encrypted share connection on the + console. This allows Wireshark to decrypt and dissect + encrypted network captures. Enable this carefully. + config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS && KEYS diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index a0b3e7d1be48..e0445e2075b2 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -79,6 +79,10 @@ convert_sfu_char(const __u16 src_char, char *target) static bool convert_sfm_char(const __u16 src_char, char *target) { + if (src_char >= 0xF001 && src_char <= 0xF01F) { + *target = src_char - 0xF000; + return true; + } switch (src_char) { case SFM_COLON: *target = ':'; @@ -417,6 +421,10 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string) { __le16 dest_char; + if (src_char >= 0x01 && src_char <= 0x1F) { + dest_char = cpu_to_le16(src_char + 0xF000); + return dest_char; + } switch (src_char) { case ':': dest_char = cpu_to_le16(SFM_COLON); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 15bac390dff9..b98436f5c7c7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1135,20 +1135,19 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, u32 acllen = 0; int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - struct cifs_tcon *tcon; + struct smb_version_operations *ops; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); if (IS_ERR(tlink)) return PTR_ERR(tlink); - tcon = tlink_tcon(tlink); - if (pfid && (tcon->ses->server->ops->get_acl_by_fid)) - pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid, - &acllen); - else if (tcon->ses->server->ops->get_acl) - pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, - &acllen); + ops = tlink_tcon(tlink)->ses->server->ops; + + if (pfid && (ops->get_acl_by_fid)) + pntsd = ops->get_acl_by_fid(cifs_sb, pfid, &acllen); + else if (ops->get_acl) + pntsd = ops->get_acl(cifs_sb, inode, path, &acllen); else { cifs_put_tlink(tlink); return -EOPNOTSUPP; @@ -1181,23 +1180,23 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - struct cifs_tcon *tcon; + struct smb_version_operations *ops; if (IS_ERR(tlink)) return PTR_ERR(tlink); - tcon = tlink_tcon(tlink); + + ops = tlink_tcon(tlink)->ses->server->ops; cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - if (tcon->ses->server->ops->get_acl == NULL) { + if (ops->get_acl == NULL) { cifs_put_tlink(tlink); return -EOPNOTSUPP; } - pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, - &secdesclen); + pntsd = ops->get_acl(cifs_sb, inode, path, &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); @@ -1224,13 +1223,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); - if (tcon->ses->server->ops->set_acl == NULL) + if (ops->set_acl == NULL) rc = -EOPNOTSUPP; if (!rc) { /* Set the security descriptor */ - rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, - path, aclflag); + rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 058ac9b36f04..68abbb0db608 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -478,6 +478,7 @@ find_timestamp(struct cifs_ses *ses) unsigned char *blobptr; unsigned char *blobend; struct ntlmssp2_name *attrptr; + struct timespec ts; if (!ses->auth_key.len || !ses->auth_key.response) return 0; @@ -502,7 +503,8 @@ find_timestamp(struct cifs_ses *ses) blobptr += attrsize; /* advance attr value */ } - return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + ktime_get_real_ts(&ts); + return cpu_to_le64(cifs_UnixTimeToNT(ts)); } static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8be55be70faf..bcc7d9acad64 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -418,7 +418,7 @@ struct smb_version_operations { int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, - size_t, const struct nls_table *, int); + size_t, struct cifs_sb_info *); int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, const struct nls_table *, int); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e49958c3f8bb..6eb3147132e3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -480,8 +480,7 @@ extern int CIFSSMBCopy(unsigned int xid, extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, - size_t bufsize, const struct nls_table *nls_codepage, - int remap_special_chars); + size_t bufsize, struct cifs_sb_info *cifs_sb); extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 205fd94f52fd..fbb0d4cbda41 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -478,14 +478,14 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) * this requirement. */ int val, seconds, remain, result; - struct timespec ts, utc; - utc = CURRENT_TIME; + struct timespec ts; + unsigned long utc = ktime_get_real_seconds(); ts = cnvrtDosUnixTm(rsp->SrvTime.Date, rsp->SrvTime.Time, 0); cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", - (int)ts.tv_sec, (int)utc.tv_sec, - (int)(utc.tv_sec - ts.tv_sec)); - val = (int)(utc.tv_sec - ts.tv_sec); + (int)ts.tv_sec, (int)utc, + (int)(utc - ts.tv_sec)); + val = (int)(utc - ts.tv_sec); seconds = abs(val); result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; remain = seconds % MIN_TZ_ADJ; @@ -697,9 +697,7 @@ cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, 1, CIFS_ECHO_OP); } @@ -1599,9 +1597,7 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, 1, 0); } @@ -2058,7 +2054,6 @@ cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; @@ -2095,9 +2090,7 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, 1, 0); } @@ -6076,11 +6069,13 @@ ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, size_t buf_size, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { /* BB assumes one setup word */ TRANSACTION2_QPI_REQ *pSMB = NULL; TRANSACTION2_QPI_RSP *pSMBr = NULL; + int remap = cifs_remap(cifs_sb); + struct nls_table *nls_codepage = cifs_sb->local_nls; int rc = 0; int bytes_returned; int list_len; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6ef78ad838e6..bc09df6b473a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -582,7 +582,7 @@ cifs_relock_file(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - down_read(&cinode->lock_sem); + down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING); if (cinode->can_cache_brlcks) { /* can cache locks - no need to relock */ up_read(&cinode->lock_sem); @@ -2234,14 +2234,16 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) set_page_writeback(page); retry_write: rc = cifs_partialpagewrite(page, 0, PAGE_SIZE); - if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) - goto retry_write; - else if (rc == -EAGAIN) + if (rc == -EAGAIN) { + if (wbc->sync_mode == WB_SYNC_ALL) + goto retry_write; redirty_page_for_writepage(wbc, page); - else if (rc != 0) + } else if (rc != 0) { SetPageError(page); - else + mapping_set_error(page->mapping, rc); + } else { SetPageUptodate(page); + } end_page_writeback(page); put_page(page); free_xid(xid); @@ -2810,12 +2812,12 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc; + inode_lock(inode); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2828,11 +2830,11 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: + up_read(&cinode->lock_sem); inode_unlock(inode); if (rc > 0) rc = generic_write_sync(iocb, rc); - up_read(&cinode->lock_sem); return rc; } @@ -3271,7 +3273,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type & ITER_IOVEC) + if (to->type == ITER_IOVEC) ctx->should_dirty = true; rc = setup_aio_ctx_iter(ctx, to, READ); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b261db34103c..a8693632235f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/freezer.h> #include <linux/sched/signal.h> +#include <linux/wait_bit.h> #include <asm/div64.h> #include "cifsfs.h" @@ -322,9 +323,9 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; - fattr->cf_atime = CURRENT_TIME; - fattr->cf_ctime = CURRENT_TIME; - fattr->cf_mtime = CURRENT_TIME; + ktime_get_real_ts(&fattr->cf_mtime); + fattr->cf_mtime = timespec_trunc(fattr->cf_mtime, sb->s_time_gran); + fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; } @@ -563,8 +564,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, "SETFILEBITS", ea_value, 4 /* size of buf */, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); + cifs_sb); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; @@ -586,9 +586,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct cifs_sb_info *cifs_sb, bool adjust_tz, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); memset(fattr, 0, sizeof(*fattr)); @@ -598,8 +599,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (info->LastAccessTime) fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - else - fattr->cf_atime = CURRENT_TIME; + else { + ktime_get_real_ts(&fattr->cf_atime); + fattr->cf_atime = timespec_trunc(fattr->cf_atime, sb->s_time_gran); + } fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); @@ -659,7 +662,6 @@ cifs_get_file_info(struct file *filp) FILE_ALL_INFO find_data; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -671,7 +673,7 @@ cifs_get_file_info(struct file *filp) rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false, + cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, false); break; case -EREMOTE: @@ -753,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } if (!rc) { - cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz, + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); @@ -1363,9 +1365,9 @@ out_reval: cifs_inode = CIFS_I(inode); cifs_inode->time = 0; /* will force revalidate to get info when needed */ - inode->i_ctime = current_fs_time(sb); + inode->i_ctime = current_time(inode); } - dir->i_ctime = dir->i_mtime = current_fs_time(sb); + dir->i_ctime = dir->i_mtime = current_time(dir); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -1633,7 +1635,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifsInode->time = 0; d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = - current_fs_time(inode->i_sb); + current_time(inode); rmdir_exit: kfree(full_path); @@ -1806,7 +1808,7 @@ unlink_target: CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = - target_dir->i_mtime = current_fs_time(source_dir->i_sb); + target_dir->i_mtime = current_time(source_dir); cifs_rename_exit: kfree(info_buf_source); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index b08531977daa..3b147dc6af63 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -810,7 +810,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) if (!pages) { pages = vmalloc(max_pages * sizeof(struct page *)); - if (!bv) { + if (!pages) { kvfree(bv); return -ENOMEM; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 27bc360c7ffd..a723df3e0197 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid, __u16 search_flags, struct cifs_search_info *srch_inf) { - return CIFSFindFirst(xid, tcon, path, cifs_sb, - &fid->netfid, search_flags, srch_inf, true); + int rc; + + rc = CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); + if (rc) + cifs_dbg(FYI, "find first failed=%d\n", rc); + return rc; } static int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c58691834eb2..ccbb397debbc 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { - cifs_dbg(VFS, "open dir failed\n"); + cifs_dbg(FYI, "open dir failed rc=%d\n", rc); return rc; } @@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); if (rc) { - cifs_dbg(VFS, "query directory failed\n"); + cifs_dbg(FYI, "query directory failed rc=%d\n", rc); SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } return rc; @@ -1288,6 +1288,108 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_ACL +static struct cifs_ntsd * +get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, + const struct cifs_fid *cifsfid, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + unsigned int xid; + int rc = -EOPNOTSUPP; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + xid = get_xid(); + cifs_dbg(FYI, "trying to get acl\n"); + + rc = SMB2_query_acl(xid, tlink_tcon(tlink), cifsfid->persistent_fid, + cifsfid->volatile_fid, (void **)&pntsd, pacllen); + free_xid(xid); + + cifs_put_tlink(tlink); + + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; + +} + +static struct cifs_ntsd * +get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + unsigned int xid; + int rc; + struct cifs_tcon *tcon; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; + __le16 *utf16_path; + + cifs_dbg(FYI, "get smb3 acl for path %s\n", path); + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + + tcon = tlink_tcon(tlink); + xid = get_xid(); + + if (backup_cred(cifs_sb)) + oparms.create_options = CREATE_OPEN_BACKUP_INTENT; + else + oparms.create_options = 0; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return ERR_PTR(-ENOMEM); + + oparms.tcon = tcon; + oparms.desired_access = READ_CONTROL; + oparms.disposition = FILE_OPEN; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (!rc) { + rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid, + fid.volatile_fid, (void **)&pntsd, pacllen); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + } + + cifs_put_tlink(tlink); + free_xid(xid); + + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); + return pntsd; +} + +/* Retrieve an ACL from the server */ +static struct cifs_ntsd * +get_smb2_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode), true); + if (!open_file) + return get_smb2_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_smb2_acl_by_fid(cifs_sb, &open_file->fid, pacllen); + cifsFileInfo_put(open_file); + return pntsd; +} +#endif + static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { @@ -1809,7 +1911,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) sg = init_sg(rqst, sign); if (!sg) { - cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc); + cifs_dbg(VFS, "%s: Failed to init sg", __func__); + rc = -ENOMEM; goto free_req; } @@ -1817,6 +1920,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) iv = kzalloc(iv_len, GFP_KERNEL); if (!iv) { cifs_dbg(VFS, "%s: Failed to alloc IV", __func__); + rc = -ENOMEM; goto free_sg; } iv[0] = 3; @@ -2391,6 +2495,11 @@ struct smb_version_operations smb20_operations = { .dir_needs_close = smb2_dir_needs_close, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_ACL + .get_acl = get_smb2_acl, + .get_acl_by_fid = get_smb2_acl_by_fid, +/* .set_acl = set_smb3_acl, */ +#endif /* CIFS_ACL */ }; struct smb_version_operations smb21_operations = { @@ -2475,6 +2584,11 @@ struct smb_version_operations smb21_operations = { .enum_snapshots = smb3_enum_snapshots, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_ACL + .get_acl = get_smb2_acl, + .get_acl_by_fid = get_smb2_acl_by_fid, +/* .set_acl = set_smb3_acl, */ +#endif /* CIFS_ACL */ }; struct smb_version_operations smb30_operations = { @@ -2569,6 +2683,11 @@ struct smb_version_operations smb30_operations = { .receive_transform = smb3_receive_transform, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_ACL + .get_acl = get_smb2_acl, + .get_acl_by_fid = get_smb2_acl_by_fid, +/* .set_acl = set_smb3_acl, */ +#endif /* CIFS_ACL */ }; #ifdef CONFIG_CIFS_SMB311 @@ -2751,7 +2870,7 @@ struct smb_version_values smb302_values = { struct smb_version_values smb311_values = { .version_string = SMB311_VERSION_STRING, .protocol_id = SMB311_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 48ff7703b919..4938e8b6d32f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1240,15 +1240,19 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, goto tcon_exit; } - if (rsp->ShareType & SMB2_SHARE_TYPE_DISK) + switch (rsp->ShareType) { + case SMB2_SHARE_TYPE_DISK: cifs_dbg(FYI, "connection to disk share\n"); - else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) { + break; + case SMB2_SHARE_TYPE_PIPE: tcon->ipc = true; cifs_dbg(FYI, "connection to pipe share\n"); - } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) { - tcon->print = true; + break; + case SMB2_SHARE_TYPE_PRINT: + tcon->ipc = true; cifs_dbg(FYI, "connection to printer\n"); - } else { + break; + default: cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); rc = -EOPNOTSUPP; goto tcon_error_exit; @@ -2077,8 +2081,9 @@ validate_and_copy_buf(unsigned int offset, unsigned int buffer_length, static int query_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, u8 info_class, - size_t output_len, size_t min_len, void *data) + u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, + u32 additional_info, size_t output_len, size_t min_len, void **data, + u32 *dlen) { struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; @@ -2104,10 +2109,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->InfoType = SMB2_O_INFO_FILE; + req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + req->AdditionalInformation = cpu_to_le32(additional_info); /* 4 for rfc1002 length field and 1 for Buffer */ req->InputBufferOffset = cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); @@ -2126,24 +2132,51 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, goto qinf_exit; } + if (dlen) { + *dlen = le32_to_cpu(rsp->OutputBufferLength); + if (!*data) { + *data = kmalloc(*dlen, GFP_KERNEL); + if (!*data) { + cifs_dbg(VFS, + "Error %d allocating memory for acl\n", + rc); + *dlen = 0; + goto qinf_exit; + } + } + } + rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp->hdr, min_len, data); + &rsp->hdr, min_len, *data); qinf_exit: free_rsp_buf(resp_buftype, rsp); return rc; } +int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_ALL_INFORMATION, SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + sizeof(struct smb2_file_all_info), (void **)&data, + NULL); +} + int -SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, +SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, - struct smb2_file_all_info *data) + void **data, u32 *plen) { + __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; + *plen = 0; + return query_info(xid, tcon, persistent_fid, volatile_fid, - FILE_ALL_INFORMATION, - sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - sizeof(struct smb2_file_all_info), data); + 0, SMB2_O_INFO_SECURITY, additional_info, + SMB2_MAX_BUFFER_SIZE, + sizeof(struct smb2_file_all_info), data, plen); } int @@ -2151,9 +2184,10 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid) { return query_info(xid, tcon, persistent_fid, volatile_fid, - FILE_INTERNAL_INFORMATION, + FILE_INTERNAL_INFORMATION, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_internal_info), - sizeof(struct smb2_file_internal_info), uniqueid); + sizeof(struct smb2_file_internal_info), + (void **)&uniqueid, NULL); } /* @@ -2173,9 +2207,7 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED) credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, CIFS_ECHO_OP); } @@ -2433,9 +2465,7 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_READ_HE); queue_work(cifsiod_wq, &rdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, 0); } @@ -2594,7 +2624,6 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; unsigned int credits_received = 1; @@ -2634,9 +2663,7 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); queue_work(cifsiod_wq, &wdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, credits_received, 0); } diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 6853454fc871..3595cd755147 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -135,6 +135,9 @@ extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); +extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + void **data, unsigned int *plen); extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index c69ec96e92ac..67367cf1f8cd 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -335,9 +335,31 @@ generate_smb3signingkey(struct cifs_ses *ses, if (rc) return rc; - return generate_key(ses, ptriplet->decryption.label, - ptriplet->decryption.context, - ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + rc = generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + + if (rc) + return rc; + +#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS + cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__); + /* + * The session id is opaque in terms of endianness, so we can't + * print it as a long long. we dump it as we got it on the wire + */ + cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid), + &ses->Suid); + cifs_dbg(VFS, "Session Key %*ph\n", + SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response); + cifs_dbg(VFS, "Signing Key %*ph\n", + SMB3_SIGN_KEY_SIZE, ses->smb3signingkey); + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey); +#endif + return rc; } int diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4d64b5b8fc9c..7efbab013957 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -94,7 +94,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) now = jiffies; /* commands taking longer than one second are indications that something is wrong, unless it is quite a slow link or server */ - if ((now - midEntry->when_alloc) > HZ) { + if (time_after(now, midEntry->when_alloc + HZ)) { if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) { pr_debug(" CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); @@ -536,11 +536,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, list_add_tail(&mid->qhead, &server->pending_mid_q); spin_unlock(&GlobalMid_Lock); - + /* + * Need to store the time in mid before calling I/O. For call_async, + * I/O response may come back and free the mid entry on another thread. + */ + cifs_save_when_sent(mid); cifs_in_send_inc(server); rc = smb_send_rqst(server, rqst, flags); cifs_in_send_dec(server); - cifs_save_when_sent(mid); if (rc < 0) { server->sequence_number -= 2; @@ -613,9 +616,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); return rc; } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 20af5187ba63..de50e749ff05 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, pcreatetime = (__u64 *)value; *pcreatetime = CIFS_I(inode)->createtime; return sizeof(__u64); - - return rc; } @@ -235,8 +233,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, name, value, size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + full_path, name, value, size, cifs_sb); break; case XATTR_CIFS_ACL: { @@ -336,8 +333,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, NULL, data, buf_size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + full_path, NULL, data, buf_size, cifs_sb); list_ea_exit: kfree(full_path); free_xid(xid); diff --git a/fs/coda/file.c b/fs/coda/file.c index 9d956cd6d46f..363402fcb3ed 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -34,7 +34,7 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos); + return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); } static ssize_t @@ -51,7 +51,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); inode_lock(coda_inode); - ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); + ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 11d087b2b28e..2dd4a7af7dd7 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -739,23 +739,22 @@ static int do_i2c_smbus_ioctl(struct file *file, unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; - compat_caddr_t datap; + union { + /* beginnings of those have identical layouts */ + struct i2c_smbus_ioctl_data32 data32; + struct i2c_smbus_ioctl_data data; + } v; tdata = compat_alloc_user_space(sizeof(*tdata)); if (tdata == NULL) return -ENOMEM; - if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) - return -EFAULT; - if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) + memset(&v, 0, sizeof(v)); + if (copy_from_user(&v.data32, udata, sizeof(v.data32))) return -EFAULT; + v.data.data = compat_ptr(v.data32.data); - if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8))) - return -EFAULT; - if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32))) - return -EFAULT; - if (__get_user(datap, &udata->data) || - __put_user(compat_ptr(datap), &tdata->data)) + if (copy_to_user(tdata, &v.data, sizeof(v.data))) return -EFAULT; return do_ioctl(file, cmd, (unsigned long)tdata); @@ -833,7 +832,7 @@ static int compat_ioctl_preallocate(struct file *file, */ #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) -#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), +#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), /* ioctl should not be warned about even if it's not implemented. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs @@ -866,8 +865,6 @@ COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) -COMPATIBLE_IOCTL(TIOCGPKT) -COMPATIBLE_IOCTL(TIOCGPTLCK) COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -883,16 +880,12 @@ COMPATIBLE_IOCTL(TIOCMGET) COMPATIBLE_IOCTL(TIOCMBIC) COMPATIBLE_IOCTL(TIOCMBIS) COMPATIBLE_IOCTL(TIOCMSET) -COMPATIBLE_IOCTL(TIOCPKT) COMPATIBLE_IOCTL(TIOCNOTTY) COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) -COMPATIBLE_IOCTL(TIOCGPTN) -COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) -COMPATIBLE_IOCTL(TIOCSIG) #ifdef TIOCSRS485 COMPATIBLE_IOCTL(TIOCSRS485) #endif diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 8b2a994042dd..a66f6624d899 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -138,6 +138,14 @@ struct config_item *config_item_get(struct config_item *item) } EXPORT_SYMBOL(config_item_get); +struct config_item *config_item_get_unless_zero(struct config_item *item) +{ + if (item && kref_get_unless_zero(&item->ci_kref)) + return item; + return NULL; +} +EXPORT_SYMBOL(config_item_get_unless_zero); + static void config_item_cleanup(struct config_item *item) { struct config_item_type *t = item->ci_type; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index a6ab012a2c6a..c8aabba502f6 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item, ret = -ENOMEM; sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { - sl->sl_target = config_item_get(item); spin_lock(&configfs_dirent_lock); if (target_sd->s_type & CONFIGFS_USET_DROPPING) { spin_unlock(&configfs_dirent_lock); - config_item_put(item); kfree(sl); return -ENOENT; } + sl->sl_target = config_item_get(item); list_add(&sl->sl_list, &target_sd->s_links); spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 08b46e6e3995..02b7d91c9231 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -7,6 +7,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR + select CRYPTO_SHA256 select KEYS help Enable encryption of files and directories. This diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a409a84f1bca..6181e9526860 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, goto errout; } err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) + if (err == 0 && bio->bi_status) err = -EIO; bio_put(bio); if (err) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6d6eca394d4d..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/dcache.h> #include <linux/namei.h> +#include <crypto/aes.h> #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; @@ -147,8 +148,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -158,6 +159,16 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, BUG_ON(len == 0); + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -170,15 +181,11 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(lblk_num); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); sg_set_page(&src, src_page, len, offs); - skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -477,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d1bb02b1ee58..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -453,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 1e1f8a361b75..a1d5021c31ef 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,10 +12,13 @@ #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt_supp.h> +#include <crypto/hash.h> /* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 +#define FS_IV_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 #define FS_AES_256_GCM_KEY_SIZE 32 #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 @@ -54,6 +57,7 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -87,4 +91,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 179e578b875b..018c588c7ac3 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,8 +10,13 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> +#include <linux/ratelimit.h> +#include <crypto/aes.h> +#include <crypto/sha.h> #include "fscrypt_private.h" +static struct crypto_shash *essiv_hash_tfm; + static void derive_crypt_complete(struct crypto_async_request *req, int rc) { struct fscrypt_completion_result *ecr = req->data; @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. */ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,7 +82,7 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix) + const char *prefix, int min_keysize) { char *description; struct key *keyring_key; @@ -111,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info, master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { - if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; + if (S_ISREG(inode->i_mode)) { + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -163,9 +178,76 @@ static void put_crypt_info(struct fscrypt_info *ci) return; crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. + */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; @@ -212,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -228,10 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix); + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -243,18 +328,30 @@ int fscrypt_get_encryption_info(struct inode *inode) ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } + } if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) crypt_info = NULL; out: diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 210976e7a269..ce07a86200f3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -38,12 +38,8 @@ static int create_encryption_context_from_policy(struct inode *inode, memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) - return -EINVAL; - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; if (policy->flags & ~FS_POLICY_FLAGS_VALID) @@ -260,6 +256,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, memcpy(ctx.master_key_descriptor, ci->ci_master_key, FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); if (res) @@ -25,7 +25,6 @@ #include <linux/mm.h> #include <linux/mutex.h> #include <linux/pagevec.h> -#include <linux/pmem.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/uio.h> @@ -84,7 +83,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +107,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; @@ -461,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking. - */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - -/* * Invalidate exceptional DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -509,21 +479,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -531,8 +505,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -807,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - wb_cache_pmem(kaddr, size); + dax_flush(dax_dev, pgoff, kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -817,6 +793,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); + trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); put_locked_mapping_entry(mapping, index, entry); @@ -857,6 +834,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -877,13 +856,16 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, dax_dev, mapping, indices[i], pvec.pages[i]); if (ret < 0) { - put_dax(dax_dev); - return ret; + mapping_set_error(mapping, ret); + goto out; } } + start_index = indices[pvec.nr - 1] + 1; } +out: put_dax(dax_dev); - return 0; + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -916,6 +898,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(ret); *entryp = ret; + trace_dax_insert_mapping(mapping->host, vmf, ret); return vm_insert_mixed(vma, vaddr, pfn); } @@ -927,6 +910,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -936,6 +920,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -948,6 +933,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -980,18 +966,19 @@ int __dax_zero_page_range(struct block_device *bdev, void *kaddr; pfn_t pfn; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); if (rc) return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); if (rc < 0) { dax_read_unlock(id); return rc; } - clear_pmem(kaddr + offset, size); + memset(kaddr + offset, 0, size); + dax_flush(dax_dev, pgoff, kaddr + offset, size); dax_read_unlock(id); } return 0; @@ -1031,7 +1018,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + if (iomap->flags & IOMAP_F_NEW) { invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1070,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(kaddr, map_len, iter); + map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, + map_len, iter); else map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { @@ -1150,34 +1138,50 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto out; + } + + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PMD fault that overlaps with + * the PTE we need to set up. If so just return and the fault will be + * retried. + */ + if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { + vmf_ret = VM_FAULT_NOPAGE; + goto unlock_entry; + } + /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); - if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ - goto finish_iomap; + if (error) { + vmf_ret = dax_fault_return(error); + goto unlock_entry; } - - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); - goto finish_iomap; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? */ + goto error_finish_iomap; } sector = dax_iomap_sector(&iomap, pos); @@ -1199,20 +1203,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } if (error) - goto error_unlock_entry; + goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); vmf_ret = finish_fault(vmf); if (!vmf_ret) vmf_ret = VM_FAULT_DONE_COW; - goto unlock_entry; + goto finish_iomap; } switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, @@ -1225,7 +1229,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { vmf_ret = dax_load_hole(mapping, &entry, vmf); - goto unlock_entry; + goto finish_iomap; } /*FALLTHRU*/ default: @@ -1234,10 +1238,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; } - error_unlock_entry: + error_finish_iomap: vmf_ret = dax_fault_return(error) | major; - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; @@ -1252,6 +1254,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } @@ -1397,6 +1403,28 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PTE fault that overlaps with + * the PMD we need to set up. If so just return and the fault will be + * retried. + */ + if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && + !pmd_devmap(*vmf->pmd)) { + result = 0; + goto unlock_entry; + } + + /* * Note that we don't use iomap_apply here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. @@ -1404,21 +1432,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pos = (loff_t)pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) - goto fallback; + goto unlock_entry; if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. - */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) - goto finish_iomap; - switch (iomap.type) { case IOMAP_MAPPED: result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); @@ -1426,7 +1444,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) - goto unlock_entry; + break; result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: @@ -1434,8 +1452,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, break; } - unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; @@ -1451,6 +1467,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/dcache.c b/fs/dcache.c index 95d71eda8142..7ece68d0d4db 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -277,6 +277,33 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } +void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + atomic_inc(&p->u.count); + spin_unlock(&dentry->d_lock); + name->name = p->name; + } else { + memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + spin_unlock(&dentry->d_lock); + name->name = name->inline_name; + } +} +EXPORT_SYMBOL(take_dentry_name_snapshot); + +void release_dentry_name_snapshot(struct name_snapshot *name) +{ + if (unlikely(name->name != name->inline_name)) { + struct external_name *p; + p = container_of(name->name, struct external_name, name[0]); + if (unlikely(atomic_dec_and_test(&p->u.count))) + kfree_rcu(p, u.head); + } +} +EXPORT_SYMBOL(release_dentry_name_snapshot); + static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -419,6 +446,8 @@ static void dentry_lru_add(struct dentry *dentry) { if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); + else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + dentry->d_flags |= DCACHE_REFERENCED; } /** @@ -779,8 +808,6 @@ repeat: goto kill_it; } - if (!(dentry->d_flags & DCACHE_REFERENCED)) - dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_lockref.count--; @@ -1494,7 +1521,7 @@ static void check_and_drop(void *_data) { struct detach_data *data = _data; - if (!data->mountpoint && !data->select.found) + if (!data->mountpoint && list_empty(&data->select.dispose)) __d_drop(data->select.start); } @@ -1536,17 +1563,15 @@ void d_invalidate(struct dentry *dentry) d_walk(dentry, &data, detach_and_collect, check_and_drop); - if (data.select.found) + if (!list_empty(&data.select.dispose)) shrink_dentry_list(&data.select.dispose); + else if (!data.mountpoint) + return; if (data.mountpoint) { detach_mounts(data.mountpoint); dput(data.mountpoint); } - - if (!data.mountpoint && !data.select.found) - break; - cond_resched(); } } @@ -3548,8 +3573,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3561,24 +3584,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3592,14 +3610,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ @@ -3610,6 +3625,11 @@ EXPORT_SYMBOL(d_genocide); void __init vfs_caches_init_early(void) { + int i; + + for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) + INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); + dcache_init_early(); inode_init_early(); } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 354e2ab62031..6dabc4a10396 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/filesystems for more details. + * See Documentation/filesystems/ for more details. * */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7fd4ec4bb214..a0e4e2f7e0be 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/kernel-api for more details. + * See ./Documentation/core-api/kernel-api.rst for more details. * */ @@ -199,7 +199,7 @@ static const struct dentry_operations debugfs_dops = { static int debug_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr debug_files[] = {{""}}; + static const struct tree_descr debug_files[] = {{""}}; struct debugfs_fs_info *fsi; int err; @@ -766,7 +766,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, { int error; struct dentry *dentry = NULL, *trap; - const char *old_name; + struct name_snapshot old_name; trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ @@ -781,19 +781,19 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name, d_is_dir(old_dentry), NULL, old_dentry); - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); unlock_rename(new_dir, old_dir); dput(dentry); return old_dentry; diff --git a/fs/direct-io.c b/fs/direct-io.c index a04ebea77de8..08cf27811e5a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work) dio_complete(dio, 0, true); } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. @@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio) /** * dio_end_io - handle the end io action for the given bio * @bio: The direct io bio thats being completed - * @error: Error if there was one * * This is meant to be called by any filesystem that uses their own dio_submit_t * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. */ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; @@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, else bio->bi_end_io = dio_bio_end_io; + bio->bi_write_hint = dio->iocb->ki_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. */ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; unsigned i; - int err; + blk_status_t err = bio->bi_status; - if (bio->bi_error) - dio->io_error = -EIO; + if (err) { + if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) + dio->io_error = -EAGAIN; + else + dio->io_error = -EIO; + } if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { - err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { @@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); put_page(page); } - err = bio->bi_error; bio_put(bio); } return err; @@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } @@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; dio->op_flags = REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_NOWAIT) + dio->op_flags |= REQ_NOWAIT; } else { dio->op = REQ_OP_READ; } diff --git a/fs/eventfd.c b/fs/eventfd.c index 68b9fffcb2c8..2fb4eadaa118 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; @@ -215,8 +215,8 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); * * Returns %0 if successful, or the following error codes: * - * -EAGAIN : The operation would have blocked but @no_wait was non-zero. - * -ERESTARTSYS : A signal interrupted the wait operation. + * - -EAGAIN : The operation would have blocked but @no_wait was non-zero. + * - -ERESTARTSYS : A signal interrupted the wait operation. * * If @no_wait is zero, the function might sleep until the eventfd internal * counter becomes greater than zero. diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..b1c8e23ddf65 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -244,7 +244,7 @@ struct eppoll_entry { * Wait queue item that will be linked to the target file wait * queue head. */ - wait_queue_t wait; + wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; @@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } -static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } @@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) * mechanism. It is called by the stored file descriptors when they * have events to report. */ -static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; @@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * can't use __remove_wait_queue(). whead->lock is held by * the caller. */ - list_del_init(&wait->task_list); + list_del_init(&wait->entry); } spin_lock_irqsave(&ep->lock, flags); @@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int res = 0, eavail, timed_out = 0; unsigned long flags; u64 slack = 0; - wait_queue_t wait; + wait_queue_entry_t wait; ktime_t expires, *to = NULL; if (timeout > 0) { diff --git a/fs/exec.c b/fs/exec.c index 72934df68471..62175cbcc801 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,7 +220,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; - struct rlimit *rlim; + unsigned long ptr_size, limit; + + /* + * Since the stack will hold pointers to the strings, we + * must account for them as well. + * + * The size calculation is the entire vma while each arg page is + * built, so each time we get here it's calculating how far it + * is currently (rather than each call being just the newly + * added size from the arg page). As a result, we need to + * always add the entire size of the pointers, so that on the + * last call to get_arg_page() we'll actually have the entire + * correct size. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (ptr_size > ULONG_MAX - size) + goto fail; + size += ptr_size; acct_arg_size(bprm, size / PAGE_SIZE); @@ -232,20 +249,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return page; /* - * Limit to 1/4-th the stack size for the argv+env strings. + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ - rlim = current->signal->rlim; - if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { - put_page(page); - return NULL; - } + limit = _STK_LIM / 4 * 3; + limit = min(limit, rlimit(RLIMIT_STACK) / 4); + if (size > limit) + goto fail; } return page; + +fail: + put_page(page); + return NULL; } static void put_arg_page(struct page *page) diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 42f9a0a0c4ca..98233a97b7b8 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -72,7 +72,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) set_page_dirty(page); if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); @@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int err; lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, len, - AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL); if (err) EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", err); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index d9650c9508e4..e2709695b177 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -100,7 +100,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) } if (IS_DIRSYNC(dir)) { - err = write_one_page(page, 1); + err = write_one_page(page); if (!err) err = sync_inode_metadata(dir, 1); } else { diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 03f5ce1d3dbe..23ebb92484c6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -113,7 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; - struct mb_cache *s_mb_cache; + struct mb_cache *s_ea_block_cache; }; static inline spinlock_t * diff --git a/fs/ext2/file.c b/fs/ext2/file.c index b21891a6bfca..d34d32bdc944 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -174,15 +174,12 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int ret; struct super_block *sb = file->f_mapping->host->i_sb; - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; ret = generic_file_fsync(file, start, end, datasync); - if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + if (ret == -EIO) /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, "detected IO error when writing metadata buffers"); - ret = -EIO; - } return ret; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 26d77f9f8c12..2dcbd5698884 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -817,7 +817,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -841,7 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 8ac673c71a36..7b1bc9059863 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -32,6 +32,7 @@ #include <linux/log2.h> #include <linux/quotaops.h> #include <linux/uaccess.h> +#include <linux/dax.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -146,9 +147,9 @@ static void ext2_put_super (struct super_block * sb) ext2_quota_off_umount(sb); - if (sbi->s_mb_cache) { - ext2_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_block_cache) { + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1130,9 +1131,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } #ifdef CONFIG_EXT2_FS_XATTR - sbi->s_mb_cache = ext2_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + sbi->s_ea_block_cache = ext2_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); goto failed_mount3; } #endif @@ -1181,8 +1182,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: - if (sbi->s_mb_cache) - ext2_xattr_destroy_cache(sbi->s_mb_cache); + if (sbi->s_ea_block_cache) + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fbdb8f171893..1b9b1268d418 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -121,6 +121,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { NULL }; +#define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache) + static inline const struct xattr_handler * ext2_xattr_handler(int name_index) { @@ -150,7 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -195,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -208,7 +210,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -246,7 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -281,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + if (ext2_xattr_cache_insert(ea_block_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -493,8 +495,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set", * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect modified block */ - mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, + bh->b_blocknr); /* keep the buffer locked while modifying it. */ } else { @@ -627,7 +629,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; - struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -655,7 +657,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + ext2_xattr_cache_insert(ea_block_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -676,7 +678,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + ext2_xattr_cache_insert(ea_block_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -721,8 +723,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect freed block */ - mb_cache_entry_delete_block(ext2_mb_cache, - hash, old_bh->b_blocknr); + mb_cache_entry_delete(ea_block_cache, hash, + old_bh->b_blocknr); /* Free the old block. */ ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); @@ -795,8 +797,8 @@ ext2_xattr_delete_inode(struct inode *inode) * This must happen under buffer lock for ext2_xattr_set2() to * reliably detect freed block */ - mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, + bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); @@ -897,21 +899,21 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext2_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", "inode %ld: block %ld read error", - inode->i_ino, (unsigned long) ce->e_block); + inode->i_ino, (unsigned long) ce->e_value); } else { lock_buffer(bh); /* @@ -924,27 +926,27 @@ again: * entry is still hashed is reliable. */ if (hlist_bl_unhashed(&ce->e_hash_list)) { - mb_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); unlock_buffer(bh); brelse(bh); goto again; } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", - (unsigned long) ce->e_block, + (unsigned long) ce->e_value, le32_to_cpu(HDR(bh)->h_refcount), EXT2_XATTR_REFCOUNT_MAX); } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_touch(ext2_mb_cache, ce); - mb_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ext2_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fd389935ecd1..09441ae07a5b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -4,6 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ +#include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -182,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type) */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) + struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; @@ -217,7 +218,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } error = ext4_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); + value, size, xattr_flags); kfree(value); if (!error) @@ -230,15 +231,23 @@ int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; - int error, retries = 0; + int error, credits, retries = 0; + size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + error = dquot_initialize(inode); + if (error) + return error; retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); + error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, + &credits); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); - error = __ext4_set_acl(handle, inode, type, acl); + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -263,13 +272,13 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, - default_acl); + default_acl, XATTR_CREATE); posix_acl_release(default_acl); } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, - acl); + acl, XATTR_CREATE); posix_acl_release(acl); } return error; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8e8046104f4d..9ebde0cd632e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1114,6 +1114,7 @@ struct ext4_inode_info { /* * Mount flags set via mount options or defaults */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ @@ -1444,6 +1445,8 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ /* tunables */ unsigned long s_stripe; @@ -1516,7 +1519,8 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache *s_mb_cache; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ @@ -1797,10 +1801,12 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ - EXT4_FEATURE_INCOMPAT_CSUM_SEED) + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -2098,6 +2104,12 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do @@ -2126,6 +2138,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + /* * Timeout and state flag for lazy initialization inode thread. */ @@ -2389,16 +2411,17 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, - uid_t *owner, int handle_type, - unsigned int line_no, int nblocks); + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); -#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ - 0, 0, 0) + i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ - (type), __LINE__, (nblocks)) + 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); @@ -2433,6 +2456,7 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb, extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); /* inode.c */ int ext4_inode_is_fast_symlink(struct inode *inode); @@ -2523,7 +2547,6 @@ extern int ext4_search_dir(struct buffer_head *bh, int buf_size, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, @@ -2705,19 +2728,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_group_desc_csum(struct super_block *sb) -{ - return ext4_has_feature_gdt_csum(sb) || - EXT4_SB(sb)->s_chksum_driver != NULL; -} - static inline int ext4_has_metadata_csum(struct super_block *sb) { WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); - return (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); } + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | @@ -2757,13 +2781,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) { - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) @@ -3007,7 +3033,6 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f97611171023..dabad1bc8617 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -77,7 +77,14 @@ #define EXT4_RESERVE_TRANS_BLOCKS 12U -#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 +/* + * Number of credits needed if we need to insert an entry into a + * directory. For each new index block, we need 4 blocks (old index + * block, new index block, bitmap block, bg summary). For normal + * htree directories there are 2 levels; if the largedir feature + * enabled it's 3 levels. + */ +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was @@ -104,20 +111,6 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) -static inline int ext4_jbd2_credits_xattr(struct inode *inode) -{ - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); - - /* - * In case of inline data, we may push out the data to a block, - * so we need to reserve credits for this eventuality - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - return credits; -} - - /* * Ext4 handle operation types -- for logging purposes */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2a97dff87b96..e0a8425ff74d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2488,7 +2488,8 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) static inline int get_default_free_blocks_flags(struct inode *inode) { - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; @@ -3413,13 +3414,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; + struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; - int split_flag = 0; + int split_flag = EXT4_EXT_DATA_VALID2; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3436,7 +3437,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - zero_ex.ee_len = 0; + zero_ex1.ee_len = 0; + zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3576,62 +3578,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (ext4_encrypted_inode(inode)) max_zeroout = 0; - /* If extent is less than s_max_zeroout_kb, zeroout directly */ - if (max_zeroout && (ee_len <= max_zeroout)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* - * four cases: + * five cases: * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. + * 2. split the extent into two extents, zeroout the head of the first + * extent. + * 3. split the extent into two extents, zeroout the tail of the second + * extent. * 4. split the extent into two extents with out zeroout. + * 5. no splitting needed, just possibly zeroout the head and / or the + * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > map->m_len)) { + if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); + /* case 3 or 5 */ + zero_ex1.ee_block = + cpu_to_le32(split_map.m_lblk + + split_map.m_len); + zero_ex1.ee_len = + cpu_to_le16(allocated - split_map.m_len); + ext4_ext_store_pblock(&zero_ex1, + ext4_ext_pblock(ex) + split_map.m_lblk + + split_map.m_len - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto out; - split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - + } + if (split_map.m_lblk - ee_block + split_map.m_len < + max_zeroout) { + /* case 2 or 5 */ + if (split_map.m_lblk != ee_block) { + zero_ex2.ee_block = ex->ee_block; + zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); - ext4_ext_store_pblock(&zero_ex, + ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); + err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto out; } + split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; allocated = map->m_len; } } @@ -3642,8 +3634,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = 0; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) - err = ext4_zeroout_es(inode, &zero_ex); + if (!err) { + err = ext4_zeroout_es(inode, &zero_ex1); + if (!err) + err = ext4_zeroout_es(inode, &zero_ex2); + } return err ? err : allocated; } @@ -4883,6 +4878,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); @@ -5569,6 +5566,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); @@ -5742,6 +5740,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cefa9835f275..58294c9a7e1d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock_shared(inode); + if (!inode_trylock_shared(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock_shared(inode); + } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore @@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_dax_write_iter(iocb, from); #endif - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -257,6 +276,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { int result; + handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -264,12 +284,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } else { + down_read(&EXT4_I(inode)->i_mmap_sem); } - down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) + if (!IS_ERR(handle)) + result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); + else + result = VM_FAULT_SIGBUS; + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); + } else { + up_read(&EXT4_I(inode)->i_mmap_sem); + } return result; } @@ -332,13 +364,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (ext4_encrypted_inode(inode)) { - int err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -422,6 +447,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } + + /* Set the flags to support nowait AIO */ + filp->f_mode |= FMODE_AIO_NOWAIT; + return dquot_file_open(inode, filp); } @@ -461,57 +490,37 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_SHIFT; - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; pagevec_init(&pvec, 0); do { int i, num; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); - if (nr_pages == 0) { - if (whence == SEEK_DATA) - break; - - BUG_ON(whence != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; + if (nr_pages == 0) break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && whence == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; - break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; /* - * If the current offset is not beyond the end of given - * range, it will be a hole. + * If current offset is smaller than the page offset, + * there is a hole at this offset. */ - if (lastoff < endoff && whence == SEEK_HOLE && - page->index > end) { + if (whence == SEEK_HOLE && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { found = 1; *offset = lastoff; goto out; } + if (page->index > end) + goto out; + lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -551,20 +560,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && whence == SEEK_HOLE) { - found = 1; - *offset = lastoff; + /* The no. of pages is less than our desired, we are done. */ + if (nr_pages < num) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + if (whence == SEEK_HOLE && lastoff < endoff) { + found = 1; + *offset = lastoff; + } out: pagevec_release(&pvec); return found; diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b19436098837..7ec340898598 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -480,6 +480,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; + ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; @@ -487,9 +488,12 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_grpblk_t last_cluster; int error = 0; + bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; + else if (keys[0].fmr_physical < bofs) + keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; start_fsb = keys[0].fmr_physical; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 9d549608fd30..aae2c3971cef 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -124,7 +124,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 98ac2f1f23b3..507bfb3344d4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -294,7 +294,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * as writing the quota to disk may need the lock as well. */ dquot_initialize(inode); - ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -743,8 +742,9 @@ out: */ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, - __u32 goal, uid_t *owner, int handle_type, - unsigned int line_no, int nblocks) + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -766,30 +766,69 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + sb = dir->i_sb; + sbi = EXT4_SB(sb); + + if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && + !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); if (err) return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) return ERR_PTR(-ENOKEY); - if (!handle) - nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; } - sb = dir->i_sb; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never + * more than 1k. In practice they are under + * 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + } + ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); /* * Initialize owners and quota early so that we don't have to account @@ -1053,6 +1092,7 @@ got: /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; @@ -1109,13 +1149,15 @@ got: goto fail_free_drop; } - err = ext4_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; + if (!(ei->i_flags & EXT4_EA_INODE_FL)) { + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; - err = ext4_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bc15c2c17633..7ffa290cbb8e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -829,7 +829,8 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d5dea4c293ef..28c5c3abddb3 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, /* Compute min_offs. */ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { + if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -1627,7 +1627,6 @@ out: struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { @@ -1649,7 +1648,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, - dir, fname, d_name, 0, res_dir); + dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1662,7 +1661,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, - dir, fname, d_name, 0, res_dir); + dir, fname, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5834c4d76be8..3c600f02673f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -144,16 +144,12 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. + * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; - - if (ext4_has_inline_data(inode)) - return 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + return S_ISLNK(inode->i_mode) && inode->i_size && + (inode->i_size < EXT4_N_BLOCKS * 4); } /* @@ -189,6 +185,8 @@ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; + int extra_credits = 3; + struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -213,7 +211,8 @@ void ext4_evict_inode(struct inode *inode) */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -238,8 +237,12 @@ void ext4_evict_inode(struct inode *inode) * protection against it */ sb_start_intwrite(inode->i_sb); + + if (!IS_NOQUOTA(inode)) + extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+3); + ext4_blocks_for_truncate(inode)+extra_credits); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -254,6 +257,16 @@ void ext4_evict_inode(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* + * Set inode->i_size to 0 before calling ext4_truncate(). We need + * special handling of symlinks here because i_size is used to + * determine whether ext4_inode_info->i_data contains symlink data or + * block mappings. Setting i_size to 0 will remove its fast symlink + * status. Erase i_data so that it becomes a valid empty block map. + */ + if (ext4_inode_is_fast_symlink(inode)) + memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -271,25 +284,17 @@ void ext4_evict_inode(struct inode *inode) } } - /* - * ext4_ext_truncate() doesn't reserve any slop when it - * restarts journal transactions; therefore there may not be - * enough credits left in the handle to remove the inode from - * the orphan list and set the dtime field. - */ - if (!ext4_handle_has_enough_credits(handle, 3)) { - err = ext4_journal_extend(handle, 3); - if (err > 0) - err = ext4_journal_restart(handle, 3); - if (err != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", err); - stop_handle: - ext4_journal_stop(handle); - ext4_orphan_del(NULL, inode); - sb_end_intwrite(inode->i_sb); - goto no_delete; - } + /* Remove xattr references. */ + err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, + extra_credits); + if (err) { + ext4_warning(inode->i_sb, "xattr delete (err %d)", err); +stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); + goto no_delete; } /* @@ -317,6 +322,7 @@ void ext4_evict_inode(struct inode *inode) ext4_free_inode(handle, inode); ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); + ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -710,7 +716,7 @@ out_sem: if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && - !IS_NOQUOTA(inode) && + !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode); @@ -2124,15 +2130,29 @@ static int ext4_writepage(struct page *page, static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; - loff_t size = i_size_read(mpd->inode); + loff_t size; int err; BUG_ON(page->index != mpd->first_page); + clear_page_dirty_for_io(page); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. clear_page_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release page lock. So only after + * clear_page_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_page() to zero-out tail of the written page. We rely + * on the barrier provided by TestClearPageDirty in + * clear_page_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. + */ + size = i_size_read(mpd->inode); if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - clear_page_dirty_for_io(page); err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; @@ -3412,7 +3432,7 @@ retry: bdev = inode->i_sb->s_bdev; iomap->bdev = bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; @@ -3447,7 +3467,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -3629,9 +3649,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); -#endif ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); @@ -3713,7 +3730,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) */ inode_lock_shared(inode); ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); + iocb->ki_pos + count - 1); if (ret) goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, @@ -4207,6 +4224,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: @@ -4699,7 +4718,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); + inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4833,6 +4852,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } brelse(iloc.bh); ext4_set_inode_flags(inode); + + if (ei->i_flags & EXT4_EA_INODE_FL) { + ext4_xattr_inode_set_class(inode); + + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + unlock_new_inode(inode); return inode; @@ -5024,7 +5052,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(raw_inode)) { + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } @@ -5274,7 +5302,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } + + /* dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. + */ + down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(inode, attr); + up_read(&EXT4_I(inode)->xattr_sem); + if (error) { ext4_journal_stop(handle); return error; @@ -5294,6 +5329,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; int shrink = (attr->ia_size <= inode->i_size); + if (ext4_encrypted_inode(inode)) { + error = fscrypt_get_encryption_info(inode); + if (error) + return error; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5637,8 +5680,9 @@ static int ext4_expand_extra_isize(struct inode *inode, /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0c21e22acd74..42b3a73143cf 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -218,7 +218,7 @@ static int ext4_ioctl_setflags(struct inode *inode, unsigned int jflag; /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; @@ -342,7 +342,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) err = -EPERM; inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) + if (ext4_is_quota_file(inode)) goto out_unlock; err = ext4_get_inode_loc(inode, &iloc); @@ -373,7 +373,13 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { + + /* __dquot_transfer() calls back ext4_get_inode_usage() which + * counts xattr inode references. + */ + down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); + up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 36de58a37653..581e357e8406 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -367,8 +367,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -2393,7 +2391,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); - new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; @@ -2639,6 +2637,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; + INIT_LIST_HEAD(&sbi->s_freed_data_list); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -2782,7 +2781,8 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count, + struct bio **biop) { ext4_fsblk_t discard_block; @@ -2791,18 +2791,18 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + if (biop) { + return __blkdev_issue_discard(sb->s_bdev, + (sector_t)discard_block << (sb->s_blocksize_bits - 9), + (sector_t)count << (sb->s_blocksize_bits - 9), + GFP_NOFS, 0, biop); + } else + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } -/* - * This function is called by the jbd2 layer once the commit has finished, - * so we know we can free the blocks that were released with that commit. - */ -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc) +static void ext4_free_data_in_buddy(struct super_block *sb, + struct ext4_free_data *entry) { - struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; @@ -2810,18 +2810,6 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count); - if (err && err != -EOPNOTSUPP) - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } - err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); @@ -2862,6 +2850,56 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_free_data *entry, *tmp; + struct bio *discard_bio = NULL; + struct list_head freed_data_list; + struct list_head *cut_pos = NULL; + int err; + + INIT_LIST_HEAD(&freed_data_list); + + spin_lock(&sbi->s_md_lock); + list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { + if (entry->efd_tid != commit_tid) + break; + cut_pos = &entry->efd_list; + } + if (cut_pos) + list_cut_position(&freed_data_list, &sbi->s_freed_data_list, + cut_pos); + spin_unlock(&sbi->s_md_lock); + + if (test_opt(sb, DISCARD)) { + list_for_each_entry(entry, &freed_data_list, efd_list) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, + &discard_bio); + if (err && err != -EOPNOTSUPP) { + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } else if (err == -EOPNOTSUPP) + break; + } + + if (discard_bio) + submit_bio_wait(discard_bio); + } + + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); +} + int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -3529,7 +3567,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "prellocated %u for group %u\n", preallocated, group); + mb_debug(1, "preallocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3887,7 +3925,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, "Error loading buddy information for %u", group); + ext4_warning(sb, "Error %d loading buddy information for %u", + err, group); put_bh(bitmap_bh); return 0; } @@ -4044,10 +4083,11 @@ repeat: BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); - err = ext4_mb_load_buddy(sb, group, &e4b); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_error(sb, "Error loading buddy information for %u", - group); + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); continue; } @@ -4303,11 +4343,14 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + int err; group = ext4_get_group_number(sb, pa->pa_pstart); - if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, "Error loading buddy information for %u", - group); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) { + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); continue; } ext4_lock_group(sb, group); @@ -4459,7 +4502,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); /* Allow to use superuser reservation for quota file */ - if (IS_NOQUOTA(ar->inode)) + if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { @@ -4578,14 +4621,28 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ -static int can_merge(struct ext4_free_data *entry1, - struct ext4_free_data *entry2) +static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, + struct ext4_free_data *entry, + struct ext4_free_data *new_entry, + struct rb_root *entry_rb_root) { - if ((entry1->efd_tid == entry2->efd_tid) && - (entry1->efd_group == entry2->efd_group) && - ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) - return 1; - return 0; + if ((entry->efd_tid != new_entry->efd_tid) || + (entry->efd_group != new_entry->efd_group)) + return; + if (entry->efd_start_cluster + entry->efd_count == + new_entry->efd_start_cluster) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + } else if (new_entry->efd_start_cluster + new_entry->efd_count == + entry->efd_start_cluster) { + new_entry->efd_count += entry->efd_count; + } else + return; + spin_lock(&sbi->s_md_lock); + list_del(&entry->efd_list); + spin_unlock(&sbi->s_md_lock); + rb_erase(&entry->efd_node, entry_rb_root); + kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack int @@ -4641,29 +4698,19 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry) && - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry) && - ext4_journal_callback_try_del(handle, &entry->efd_jce)) { - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - kmem_cache_free(ext4_free_data_cachep, entry); - } + ext4_try_merge_freed_extent(sbi, entry, new_entry, + &(db->bb_free_root)); } - /* Add the extent to transaction's private list */ - new_entry->efd_jce.jce_func = ext4_free_data_callback; + spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, &new_entry->efd_jce); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); return 0; @@ -4866,7 +4913,8 @@ do_more: * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count); + err = ext4_issue_discard(sb, block_group, bit, count, + NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%lu failed" @@ -5089,7 +5137,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count, NULL); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; @@ -5127,8 +5175,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); + ext4_warning(sb, "Error %d loading buddy information for %u", + ret, group); return ret; } bitmap = e4b.bd_bitmap; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 2bed62084a8c..009300ee1561 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -78,10 +78,8 @@ do { \ struct ext4_free_data { - /* MUST be the first member */ - struct ext4_journal_cb_entry efd_jce; - - /* ext4_free_data private data starts from here */ + /* this links the free block information from sb_info */ + struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 364ea4d4a943..cf5181b62df1 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), - S_IFREG, NULL, goal, owner); + S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c992ef2c2f94..9bb36909ec92 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -484,7 +484,7 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } - if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " "not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b81f7d46f344..13f0cadb1238 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { - return le32_to_cpu(entry->block) & 0x00ffffff; + return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) @@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; @@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } indirect = root->info.indirect_levels; - if (indirect > 1) { - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", - root->info.indirect_levels); + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } goto fail; } @@ -859,12 +866,19 @@ fail: static void dx_release(struct dx_frame *frames) { + struct dx_root_info *info; + int i; + if (frames[0].bh == NULL) return; - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); + info = &((struct dx_root *)frames[0].bh->b_data)->info; + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; + brelse(frames[i].bh); + frames[i].bh = NULL; + } } /* @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; @@ -1155,12 +1169,11 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - fname, d_name, offset, res_dir); + fname, offset, res_dir); } /* @@ -1262,7 +1275,6 @@ static inline bool ext4_match(const struct ext4_filename *fname, */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; @@ -1355,7 +1367,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) @@ -1430,11 +1442,11 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); - goto next; + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, @@ -1444,10 +1456,11 @@ restart: EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); - goto next; + ret = ERR_PTR(-EFSBADCRC); + goto cleanup_and_exit; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, &fname, d_name, + i = search_dirblock(bh, dir, &fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1487,8 +1500,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_frame frames[2], *frame; - const struct qstr *d_name = fname->usr_fname; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1505,7 +1517,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, fname, d_name, + retval = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1530,7 +1542,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, bh = NULL; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; @@ -1892,7 +1904,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirent_node(handle, dir, bh); @@ -1911,7 +1923,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, { struct buffer_head *bh2; struct dx_root *root; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; struct ext4_dir_entry_tail *t; @@ -2130,13 +2142,16 @@ out: static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; + int restart; int err; +again: + restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); @@ -2158,24 +2173,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, if (err != -ENOSPC) goto cleanup; + err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; + int levels = frame - frames + 1; + unsigned int icount; + int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning_inode(dir, "Directory index full!"); + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is " + "not enabled on this " + "filesystem"); + } err = -ENOSPC; goto cleanup; } + icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); @@ -2190,7 +2225,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, err = ext4_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; - if (levels) { + if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", @@ -2198,7 +2233,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, - frames[0].bh); + (frame - 1)->bh); if (err) goto journal_error; @@ -2214,17 +2249,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block(frames + 0, hash2, newblock); - dxtrace(dx_show_index("node", frames[1].entries)); + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); + err = ext4_handle_dirty_dx_node(handle, dir, + (frame - 1)->bh); + if (err) + goto journal_error; + if (restart) { + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + goto journal_error; + } } else { - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); + struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -2232,22 +2275,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); + dxroot = (struct dx_root *)frames[0].bh->b_data; + dxroot->info.indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; - } - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); - if (err) { - ext4_std_error(inode->i_sb, err); - goto cleanup; + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); @@ -2259,10 +2298,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, goto cleanup; journal_error: - ext4_std_error(dir->i_sb, err); + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ + if (restart && err == 0) + goto again; return err; } @@ -2299,7 +2343,7 @@ int ext4_generic_delete_entry(handle_t *handle, blocksize); else de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138ba739..c2fce4478cca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio) bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -349,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0; + io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -396,6 +398,7 @@ submit_and_retry: ret = io_submit_init_bio(io, bh); if (ret) return ret; + io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829d56de..40a5497b0f60 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 96973ee74147..0886fe82e9c4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,6 +37,7 @@ #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> +#include <linux/dax.h> #include <linux/cleancache.h> #include <linux/uaccess.h> @@ -372,6 +373,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); + + ext4_process_freed_data(sb, txn->t_tid); + spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, @@ -847,14 +851,9 @@ static inline void ext4_quota_off_umount(struct super_block *sb) { int type; - if (ext4_has_feature_quota(sb)) { - dquot_disable(sb, -1, - DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - } else { - /* Use our quota_off function to clear inode flags etc. */ - for (type = 0; type < EXT4_MAXQUOTAS; type++) - ext4_quota_off(sb, type); - } + /* Use our quota_off function to clear inode flags etc. */ + for (type = 0; type < EXT4_MAXQUOTAS; type++) + ext4_quota_off(sb, type); } #else static inline void ext4_quota_off_umount(struct super_block *sb) @@ -931,9 +930,13 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); @@ -1147,7 +1150,16 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; - int res, res2, retries = 0; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; res = ext4_convert_inline_data(inode); if (res) @@ -1178,9 +1190,16 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, return res; } + res = dquot_initialize(inode); + if (res) + return res; retry: - handle = ext4_journal_start(inode, EXT4_HT_MISC, - ext4_jbd2_credits_xattr(inode)); + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1204,7 +1223,7 @@ retry: return res; } -static int ext4_dummy_context(struct inode *inode) +static bool ext4_dummy_context(struct inode *inode) { return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); } @@ -1257,16 +1276,17 @@ static struct dquot **ext4_get_dquots(struct inode *inode) } static const struct dquot_operations ext4_quota_operations = { - .get_reserved_space = ext4_get_reserved_space, - .write_dquot = ext4_write_dquot, - .acquire_dquot = ext4_acquire_dquot, - .release_dquot = ext4_release_dquot, - .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, - .get_projid = ext4_get_projid, - .get_next_id = ext4_get_next_id, + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, + .get_inode_usage = ext4_get_inode_usage, + .get_next_id = ext4_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1329,7 +1349,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, Opt_nojournal_checksum, + Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, }; static const match_table_t tokens = { @@ -1412,6 +1432,8 @@ static const match_table_t tokens = { {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_nombcache, "nombcache"}, + {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1619,6 +1641,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -2155,7 +2178,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) return 0; size = roundup_pow_of_two(size * sizeof(struct flex_groups)); - new_groups = ext4_kvzalloc(size, GFP_KERNEL); + new_groups = kvzalloc(size, GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size / (int) sizeof(struct flex_groups)); @@ -3446,7 +3469,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (ext4_has_feature_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) { sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); @@ -3468,7 +3492,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb)) + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3598,6 +3622,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "The Hurd can't support 64-bit file systems"); goto failed_mount; } + + /* + * ea_inode feature uses l_i_version field which is not + * available in HURD_COMPAT mode. + */ + if (ext4_has_feature_ea_inode(sb)) { + ext4_msg(sb, KERN_ERR, + "ea_inode feature is not supported for Hurd"); + goto failed_mount; + } } if (IS_EXT2_SB(sb)) { @@ -3889,7 +3923,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } } - sbi->s_group_desc = ext4_kvmalloc(db_count * + sbi->s_group_desc = kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { @@ -3951,7 +3985,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -4062,10 +4096,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - sbi->s_mb_cache = ext4_xattr_create_cache(); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); - goto failed_mount_wq; + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_block_cache"); + goto failed_mount_wq; + } + + if (ext4_has_feature_ea_inode(sb)) { + sbi->s_ea_inode_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_inode_cache) { + ext4_msg(sb, KERN_ERR, + "Failed to create ea_inode_cache"); + goto failed_mount_wq; + } + } } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -4297,9 +4343,13 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_mb_cache) { - ext4_xattr_destroy_cache(sbi->s_mb_cache); - sbi->s_mb_cache = NULL; + if (sbi->s_ea_inode_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + } + if (sbi->s_ea_block_cache) { + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -4958,6 +5008,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { + ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); + err = -EINVAL; + goto restore_opts; + } + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); @@ -5484,7 +5540,7 @@ static int ext4_quota_off(struct super_block *sb, int type) goto out; err = dquot_quota_off(sb, type); - if (err) + if (err || ext4_has_feature_quota(sb)) goto out_put; inode_lock(inode); @@ -5504,6 +5560,7 @@ static int ext4_quota_off(struct super_block *sb, int type) out_unlock: inode_unlock(inode); out_put: + lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d74dc5f81a04..48c7a7d55ed3 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -100,7 +100,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (!ret || val >= clusters) + if (ret || val >= clusters) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8fb7ce14e6eb..cff4f41ced61 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -72,12 +72,14 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); -static struct buffer_head *ext4_xattr_cache_find(struct inode *, - struct ext4_xattr_header *, - struct mb_cache_entry **); -static void ext4_xattr_rehash(struct ext4_xattr_header *, - struct ext4_xattr_entry *); +static void ext4_xattr_block_cache_insert(struct mb_cache *, + struct buffer_head *); +static struct buffer_head * +ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, + struct mb_cache_entry **); +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count); +static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, @@ -104,8 +106,22 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ - inode->i_sb->s_fs_info)->s_mb_cache) +#define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_block_cache) + +#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_ea_inode_cache) + +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode); + +#ifdef CONFIG_LOCKDEP +void ext4_xattr_inode_set_class(struct inode *ea_inode) +{ + lockdep_set_subclass(&ea_inode->i_rwsem, 1); +} +#endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, @@ -177,9 +193,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, /* Check the values */ while (!IS_LAST_ENTRY(entry)) { - if (entry->e_value_block != 0) - return -EFSCORRUPTED; - if (entry->e_value_size != 0) { + if (entry->e_value_size != 0 && + entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); u32 size = le32_to_cpu(entry->e_value_size); void *value; @@ -269,6 +284,185 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, return cmp ? -ENODATA : 0; } +static u32 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) +{ + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); +} + +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) +{ + return ((u64)ea_inode->i_ctime.tv_sec << 32) | + ((u32)ea_inode->i_version); +} + +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) +{ + ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + ea_inode->i_version = (u32)ref_count; +} + +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) +{ + return (u32)ea_inode->i_atime.tv_sec; +} + +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) +{ + ea_inode->i_atime.tv_sec = hash; +} + +/* + * Read the EA value from an inode. + */ +static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) +{ + unsigned long block = 0; + struct buffer_head *bh; + int blocksize = ea_inode->i_sb->s_blocksize; + size_t csize, copied = 0; + void *copy_pos = buf; + + while (copied < size) { + csize = (size - copied) > blocksize ? blocksize : size - copied; + bh = ext4_bread(NULL, ea_inode, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + if (!bh) + return -EFSCORRUPTED; + + memcpy(copy_pos, bh->b_data, csize); + brelse(bh); + + copy_pos += csize; + block += 1; + copied += csize; + } + return 0; +} + +static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + struct inode **ea_inode) +{ + struct inode *inode; + int err; + + inode = ext4_iget(parent->i_sb, ea_ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(parent->i_sb, + "error while reading EA inode %lu err=%d", ea_ino, + err); + return err; + } + + if (is_bad_inode(inode)) { + ext4_error(parent->i_sb, + "error while reading EA inode %lu is_bad_inode", + ea_ino); + err = -EIO; + goto error; + } + + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + ext4_error(parent->i_sb, + "EA inode %lu does not have EXT4_EA_INODE_FL flag", + ea_ino); + err = -EINVAL; + goto error; + } + + *ea_inode = inode; + return 0; +error: + iput(inode); + return err; +} + +static int +ext4_xattr_inode_verify_hashes(struct inode *ea_inode, + struct ext4_xattr_entry *entry, void *buffer, + size_t size) +{ + u32 hash; + + /* Verify stored hash matches calculated hash. */ + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); + if (hash != ext4_xattr_inode_get_hash(ea_inode)) + return -EFSCORRUPTED; + + if (entry) { + __le32 e_hash, tmp_data; + + /* Verify entry hash. */ + tmp_data = cpu_to_le32(hash); + e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, + &tmp_data, 1); + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + } + return 0; +} + +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + +/* + * Read xattr value from the EA inode. + */ +static int +ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, + void *buffer, size_t size) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + struct inode *ea_inode; + int err; + + err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), + &ea_inode); + if (err) { + ea_inode = NULL; + goto out; + } + + if (i_size_read(ea_inode) != size) { + ext4_warning_inode(ea_inode, + "ea_inode file size=%llu entry size=%zu", + i_size_read(ea_inode), size); + err = -EFSCORRUPTED; + goto out; + } + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + if (err) + goto out; + + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); + /* + * Compatibility check for old Lustre ea_inode implementation. Old + * version does not have hash validation, but it has a backpointer + * from ea_inode to the parent inode. + */ + if (err == -EFSCORRUPTED) { + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || + ea_inode->i_generation != inode->i_generation) { + ext4_warning_inode(ea_inode, + "EA inode hash validation failed"); + goto out; + } + /* Do not add ea_inode to the cache. */ + ea_inode_cache = NULL; + } else if (err) + goto out; + + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); +out: + iput(ea_inode); + return err; +} + static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) @@ -277,7 +471,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -298,7 +492,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) @@ -308,8 +502,15 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + memcpy(buffer, bh->b_data + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -350,8 +551,15 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, + size); + if (error) + goto cleanup; + } else { + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -428,7 +636,6 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -450,7 +657,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EFSCORRUPTED; goto cleanup; } - ext4_xattr_cache_insert(ext4_mb_cache, bh); + ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -539,15 +746,445 @@ static void ext4_xattr_update_super_block(handle_t *handle, } } +int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) +{ + struct ext4_iloc iloc = { .bh = NULL }; + struct buffer_head *bh = NULL; + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + qsize_t ea_inode_refs = 0; + void *end; + int ret; + + lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); + + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + ret = xattr_check_inode(inode, header, end); + if (ret) + goto out; + + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + ret = -EIO; + goto out; + } + + if (ext4_xattr_check_block(inode, bh)) { + ret = -EFSCORRUPTED; + goto out; + } + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ea_inode_refs++; + } + *usage = ea_inode_refs + 1; + ret = 0; +out: + brelse(iloc.bh); + brelse(bh); + return ret; +} + +static inline size_t round_up_cluster(struct inode *inode, size_t length) +{ + struct super_block *sb = inode->i_sb; + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + + inode->i_blkbits); + size_t mask = ~(cluster_size - 1); + + return (length + cluster_size - 1) & mask; +} + +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) +{ + int err; + + err = dquot_alloc_inode(inode); + if (err) + return err; + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); + if (err) + dquot_free_inode(inode); + return err; +} + +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) +{ + dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); + dquot_free_inode(inode); +} + +int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create) +{ + int credits; + int blocks; + + /* + * 1) Owner inode update + * 2) Ref count update on old xattr block + * 3) new xattr block + * 4) block bitmap update for new xattr block + * 5) group descriptor for new xattr block + * 6) block bitmap update for old xattr block + * 7) group descriptor for old block + * + * 6 & 7 can happen if we have two racing threads T_a and T_b + * which are each trying to set an xattr on inodes I_a and I_b + * which were both initially sharing an xattr block. + */ + credits = 7; + + /* Quota updates. */ + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (inode && ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + /* We are done if ea_inode feature is not enabled. */ + if (!ext4_has_feature_ea_inode(sb)) + return credits; + + /* New ea_inode, inode map, block bitmap, group descriptor. */ + credits += 4; + + /* Data blocks. */ + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree. */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + + /* Blocks themselves. */ + credits += blocks; + + if (!is_create) { + /* Dereference ea_inode holding old xattr value. + * Old ea_inode, inode map, block bitmap, group descriptor. + */ + credits += 4; + + /* Data blocks for old ea_inode. */ + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; + + /* Indirection block or one level of extent tree for old + * ea_inode. + */ + blocks += 1; + + /* Block bitmap and group descriptor updates for each block. */ + credits += blocks * 2; + } + + /* We may need to clone the existing xattr block in which case we need + * to increment ref counts for existing ea_inodes referenced by it. + */ + if (block_bh) { + struct ext4_xattr_entry *entry = BFIRST(block_bh); + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + /* Ref count update on ea_inode. */ + credits += 1; + } + return credits; +} + +static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, + int credits, struct buffer_head *bh, + bool dirty, bool block_csum) +{ + int error; + + if (!ext4_handle_valid(handle)) + return 0; + + if (handle->h_buffer_credits >= credits) + return 0; + + error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); + if (!error) + return 0; + if (error < 0) { + ext4_warning(inode->i_sb, "Extend journal (error %d)", error); + return error; + } + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + + error = ext4_journal_restart(handle, credits); + if (error) { + ext4_warning(inode->i_sb, "Restart journal (error %d)", error); + return error; + } + + if (bh) { + error = ext4_journal_get_write_access(handle, bh); + if (error) { + ext4_warning(inode->i_sb, + "Get write access failed (error %d)", + error); + return error; + } + } + return 0; +} + +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + int ref_change) +{ + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); + struct ext4_iloc iloc; + s64 ref_count; + u32 hash; + int ret; + + inode_lock(ea_inode); + + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); + if (ret) { + iloc.bh = NULL; + goto out; + } + + ref_count = ext4_xattr_inode_get_ref(ea_inode); + ref_count += ref_change; + ext4_xattr_inode_set_ref(ea_inode, ref_count); + + if (ref_change > 0) { + WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 1) { + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + set_nlink(ea_inode, 1); + ext4_orphan_del(handle, ea_inode); + + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_create(ea_inode_cache, + GFP_NOFS, hash, + ea_inode->i_ino, + true /* reusable */); + } + } + } else { + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", + ea_inode->i_ino, ref_count); + + if (ref_count == 0) { + WARN_ONCE(ea_inode->i_nlink != 1, + "EA inode %lu i_nlink=%u", + ea_inode->i_ino, ea_inode->i_nlink); + + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); + + if (ea_inode_cache) { + hash = ext4_xattr_inode_get_hash(ea_inode); + mb_cache_entry_delete(ea_inode_cache, hash, + ea_inode->i_ino); + } + } + } + + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); + iloc.bh = NULL; + if (ret) + ext4_warning_inode(ea_inode, + "ext4_mark_iloc_dirty() failed ret=%d", ret); +out: + brelse(iloc.bh); + inode_unlock(ea_inode); + return ret; +} + +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, 1); +} + +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) +{ + return ext4_xattr_inode_update_ref(handle, ea_inode, -1); +} + +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, + struct ext4_xattr_entry *first) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *failed_entry; + unsigned int ea_ino; + int err, saved_err; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) + goto cleanup; + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "inc ref error %d", err); + iput(ea_inode); + goto cleanup; + } + iput(ea_inode); + } + return 0; + +cleanup: + saved_err = err; + failed_entry = entry; + + for (entry = first; entry != failed_entry; + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) { + ext4_warning(parent->i_sb, + "cleanup ea_ino %u iget error %d", ea_ino, + err); + continue; + } + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", + err); + iput(ea_inode); + } + return saved_err; +} + +static void +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, + struct buffer_head *bh, + struct ext4_xattr_entry *first, bool block_csum, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits, bool skip_quota) +{ + struct inode *ea_inode; + struct ext4_xattr_entry *entry; + bool dirty = false; + unsigned int ea_ino; + int err; + int credits; + + /* One credit for dec ref on ea_inode, one for orphan list addition, */ + credits = 2 + extra_credits; + + for (entry = first; !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + ea_ino = le32_to_cpu(entry->e_value_inum); + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + if (err) + continue; + + err = ext4_expand_inode_array(ea_inode_array, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, + "Expand inode array err=%d", err); + iput(ea_inode); + continue; + } + + err = ext4_xattr_ensure_credits(handle, parent, credits, bh, + dirty, block_csum); + if (err) { + ext4_warning_inode(ea_inode, "Ensure credits err=%d", + err); + continue; + } + + err = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (err) { + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", + err); + continue; + } + + if (!skip_quota) + ext4_xattr_inode_free_quota(parent, + le32_to_cpu(entry->e_value_size)); + + /* + * Forget about ea_inode within the same transaction that + * decrements the ref count. This avoids duplicate decrements in + * case the rest of the work spills over to subsequent + * transactions. + */ + entry->e_value_inum = 0; + entry->e_value_size = 0; + + dirty = true; + } + + if (dirty) { + /* + * Note that we are deliberately skipping csum calculation for + * the final update because we do not expect any journal + * restarts until xattr block is freed. + */ + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + ext4_warning_inode(parent, + "handle dirty metadata err=%d", err); + } +} + /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) + struct buffer_head *bh, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; @@ -565,9 +1202,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); + + if (ext4_has_feature_ea_inode(inode->i_sb)) + ext4_xattr_inode_dec_ref_all(handle, inode, bh, + BFIRST(bh), + true /* block_csum */, + ea_inode_array, + extra_credits, + true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -577,11 +1224,13 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; - ce = mb_cache_entry_get(ext4_mb_cache, hash, - bh->b_blocknr); - if (ce) { - ce->e_reusable = 1; - mb_cache_entry_put(ext4_mb_cache, ce); + if (ea_block_cache) { + ce = mb_cache_entry_get(ea_block_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ea_block_cache, ce); + } } } @@ -620,7 +1269,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -631,113 +1280,454 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +/* + * Write the value of the EA in an inode. + */ +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, + const void *buf, int bufsize) +{ + struct buffer_head *bh = NULL; + unsigned long block = 0; + int blocksize = ea_inode->i_sb->s_blocksize; + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int csize, wsize = 0; + int ret = 0; + int retries = 0; + +retry: + while (ret >= 0 && ret < max_blocks) { + struct ext4_map_blocks map; + map.m_lblk = block += ret; + map.m_len = max_blocks -= ret; + + ret = ext4_map_blocks(handle, ea_inode, &map, + EXT4_GET_BLOCKS_CREATE); + if (ret <= 0) { + ext4_mark_inode_dirty(handle, ea_inode); + if (ret == -ENOSPC && + ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + break; + } + } + + if (ret < 0) + return ret; + + block = 0; + while (wsize < bufsize) { + if (bh != NULL) + brelse(bh); + csize = (bufsize - wsize) > blocksize ? blocksize : + bufsize - wsize; + bh = ext4_getblk(handle, ea_inode, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (ret) + goto out; + + memcpy(bh->b_data, buf, csize); + set_buffer_uptodate(bh); + ext4_handle_dirty_metadata(handle, ea_inode, bh); + + buf += csize; + wsize += csize; + block += 1; + } + + inode_lock(ea_inode); + i_size_write(ea_inode, wsize); + ext4_update_i_disksize(ea_inode, wsize); + inode_unlock(ea_inode); + + ext4_mark_inode_dirty(handle, ea_inode); + +out: + brelse(bh); + + return ret; +} + +/* + * Create an inode to store the value of a large EA. + */ +static struct inode *ext4_xattr_inode_create(handle_t *handle, + struct inode *inode, u32 hash) +{ + struct inode *ea_inode = NULL; + uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; + int err; + + /* + * Let the next inode be the goal, so we try and allocate the EA inode + * in the same group, or nearby one. + */ + ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG | 0600, NULL, inode->i_ino + 1, owner, + EXT4_EA_INODE_FL); + if (!IS_ERR(ea_inode)) { + ea_inode->i_op = &ext4_file_inode_operations; + ea_inode->i_fop = &ext4_file_operations; + ext4_set_aops(ea_inode); + ext4_xattr_inode_set_class(ea_inode); + unlock_new_inode(ea_inode); + ext4_xattr_inode_set_ref(ea_inode, 1); + ext4_xattr_inode_set_hash(ea_inode, hash); + err = ext4_mark_inode_dirty(handle, ea_inode); + if (!err) + err = ext4_inode_attach_jinode(ea_inode); + if (err) { + iput(ea_inode); + return ERR_PTR(err); + } + + /* + * Xattr inodes are shared therefore quota charging is performed + * at a higher level. + */ + dquot_free_inode(ea_inode); + dquot_drop(ea_inode); + inode_lock(ea_inode); + ea_inode->i_flags |= S_NOQUOTA; + inode_unlock(ea_inode); + } + + return ea_inode; +} + +static struct inode * +ext4_xattr_inode_cache_find(struct inode *inode, const void *value, + size_t value_len, u32 hash) +{ + struct inode *ea_inode; + struct mb_cache_entry *ce; + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); + void *ea_data; + + if (!ea_inode_cache) + return NULL; + + ce = mb_cache_entry_find_first(ea_inode_cache, hash); + if (!ce) + return NULL; + + ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + if (!ea_data) { + mb_cache_entry_put(ea_inode_cache, ce); + return NULL; + } + + while (ce) { + ea_inode = ext4_iget(inode->i_sb, ce->e_value); + if (!IS_ERR(ea_inode) && + !is_bad_inode(ea_inode) && + (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && + i_size_read(ea_inode) == value_len && + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && + !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, + value_len) && + !memcmp(value, ea_data, value_len)) { + mb_cache_entry_touch(ea_inode_cache, ce); + mb_cache_entry_put(ea_inode_cache, ce); + kvfree(ea_data); + return ea_inode; + } + + if (!IS_ERR(ea_inode)) + iput(ea_inode); + ce = mb_cache_entry_find_next(ea_inode_cache, ce); + } + kvfree(ea_data); + return NULL; +} + +/* + * Add value of the EA in an inode. + */ +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, + const void *value, size_t value_len, + struct inode **ret_inode) +{ + struct inode *ea_inode; + u32 hash; + int err; + + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); + if (ea_inode) { + err = ext4_xattr_inode_inc_ref(handle, ea_inode); + if (err) { + iput(ea_inode); + return err; + } + + *ret_inode = ea_inode; + return 0; + } + + /* Create an inode for the EA value */ + ea_inode = ext4_xattr_inode_create(handle, inode, hash); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + + err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); + if (err) { + ext4_xattr_inode_dec_ref(handle, ea_inode); + iput(ea_inode); + return err; + } + + if (EA_INODE_CACHE(inode)) + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, + ea_inode->i_ino, true /* reusable */); + + *ret_inode = ea_inode; + return 0; +} + +/* + * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode + * feature is enabled. + */ +#define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) + +static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + struct ext4_xattr_search *s, + handle_t *handle, struct inode *inode, + bool is_block) { struct ext4_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + struct ext4_xattr_entry *here = s->here; + size_t min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + struct inode *old_ea_inode = NULL; + struct inode *new_ea_inode = NULL; + size_t old_size, new_size; + int ret; + + /* Space used by old and new values. */ + old_size = (!s->not_found && !here->e_value_inum) ? + EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; + new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; + + /* + * Optimization for the simple case when old and new values have the + * same padded sizes. Not applicable if external inodes are involved. + */ + if (new_size && new_size == old_size) { + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + here->e_value_size = cpu_to_le32(i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, new_size); + } else { + memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, new_size - i->value_len); + } + return 0; + } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT4_XATTR_SIZE(size); - } - free += EXT4_XATTR_LEN(name_len); - } + + /* Check whether we have enough space. */ if (i->value) { - if (free < EXT4_XATTR_LEN(name_len) + - EXT4_XATTR_SIZE(i->value_len)) - return -ENOSPC; + size_t free; + + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) + free += EXT4_XATTR_LEN(name_len) + old_size; + + if (free < EXT4_XATTR_LEN(name_len) + new_size) { + ret = -ENOSPC; + goto out; + } + + /* + * If storing the value in an external inode is an option, + * reserve space for xattr entries/names in the external + * attribute block so that a long value does not occupy the + * whole space and prevent futher entries being added. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + new_size && is_block && + (min_offs + old_size - new_size) < + EXT4_XATTR_BLOCK_RESERVE(inode)) { + ret = -ENOSPC; + goto out; + } } - if (i->value && s->not_found) { - /* Insert the new name. */ - size_t size = EXT4_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); - } else { - if (s->here->e_value_size) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT4_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); - } else { - /* Clear pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); - memcpy(val, i->value, i->value_len); - } - return 0; - } + /* + * Getting access to old and new ea inodes is subject to failures. + * Finish that work before doing any modifications to the xattr data. + */ + if (!s->not_found && here->e_value_inum) { + ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + &old_ea_inode); + if (ret) { + old_ea_inode = NULL; + goto out; + } + } + if (i->value && in_inode) { + WARN_ON_ONCE(!i->value_len); + + ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); + if (ret) + goto out; + + ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, + i->value_len, + &new_ea_inode); + if (ret) { + new_ea_inode = NULL; + ext4_xattr_inode_free_quota(inode, i->value_len); + goto out; + } + } - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. */ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT4_XATTR_NEXT(last); + if (old_ea_inode) { + /* We are ready to release ref count on the old_ea_inode. */ + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); + if (ret) { + /* Release newly required ref count on new_ea_inode. */ + if (new_ea_inode) { + int err; + + err = ext4_xattr_inode_dec_ref(handle, + new_ea_inode); + if (err) + ext4_warning_inode(new_ea_inode, + "dec ref new_ea_inode err=%d", + err); + ext4_xattr_inode_free_quota(inode, + i->value_len); } + goto out; } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT4_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); + + ext4_xattr_inode_free_quota(inode, + le32_to_cpu(here->e_value_size)); + } + + /* No failures allowed past this point. */ + + if (!s->not_found && here->e_value_offs) { + /* Remove the old value. */ + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(here->e_value_offs); + void *val = s->base + offs; + + memmove(first_val + old_size, first_val, val - first_val); + memset(first_val, 0, old_size); + min_offs += old_size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + + if (!last->e_value_inum && + last->e_value_size && o < offs) + last->e_value_offs = cpu_to_le16(o + old_size); + last = EXT4_XATTR_NEXT(last); } } + if (!i->value) { + /* Remove old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + + last = ENTRY((void *)last - size); + memmove(here, (void *)here + size, + (void *)last - (void *)here + sizeof(__u32)); + memset(last, 0, size); + } else if (s->not_found) { + /* Insert new name. */ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)here + sizeof(__u32); + + memmove((void *)here + size, here, rest); + memset(here, 0, size); + here->e_name_index = i->name_index; + here->e_name_len = name_len; + memcpy(here->e_name, i->name, name_len); + } else { + /* This is an update, reset value info. */ + here->e_value_inum = 0; + here->e_value_offs = 0; + here->e_value_size = 0; + } + if (i->value) { - /* Insert the new value. */ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { - size_t size = EXT4_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); + /* Insert new value. */ + if (in_inode) { + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); + } else if (i->value_len) { + void *val = s->base + min_offs - new_size; + + here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { - memset(val, 0, size); + memset(val, 0, new_size); } else { - /* Clear the pad bytes first. */ - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); memcpy(val, i->value, i->value_len); + /* Clear padding bytes. */ + memset(val + i->value_len, 0, + new_size - i->value_len); } } + here->e_value_size = cpu_to_le32(i->value_len); } - return 0; + + if (i->value) { + __le32 hash = 0; + + /* Entry hash calculation. */ + if (in_inode) { + __le32 crc32c_hash; + + /* + * Feed crc32c hash instead of the raw value for entry + * hash calculation. This is to avoid walking + * potentially long value buffer again. + */ + crc32c_hash = cpu_to_le32( + ext4_xattr_inode_get_hash(new_ea_inode)); + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, + &crc32c_hash, 1); + } else if (is_block) { + __le32 *value = s->base + min_offs - new_size; + + hash = ext4_xattr_hash_entry(here->e_name, + here->e_name_len, value, + new_size >> 2); + } + here->e_hash = hash; + } + + if (is_block) + ext4_xattr_rehash((struct ext4_xattr_header *)s->base); + + ret = 0; +out: + iput(old_ea_inode); + iput(new_ea_inode); + return ret; } struct ext4_xattr_block_find { @@ -794,15 +1784,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; - struct ext4_xattr_search *s = &bs->s; + struct ext4_xattr_search s_copy = bs->s; + struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + struct inode *ea_inode = NULL; + size_t old_ea_inode_size = 0; #define header(x) ((struct ext4_xattr_header *)(x)) - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); @@ -818,17 +1809,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb_cache_entry_delete_block(ext4_mb_cache, hash, - bs->bh->b_blocknr); + if (ea_block_cache) + mb_cache_entry_delete(ea_block_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), - s->here); - ext4_xattr_cache_insert(ext4_mb_cache, - bs->bh); - } + error = ext4_xattr_set_entry(i, s, handle, inode, + true /* is_block */); + if (!error) + ext4_xattr_block_cache_insert(ea_block_cache, + bs->bh); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -854,6 +1843,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; + + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + /* + * Defer quota free call for previous inode + * until success is guaranteed. + */ + old_ea_inode_size = le32_to_cpu( + s->here->e_value_size); + s->here->e_value_inum = 0; + s->here->e_value_size = 0; + } } } else { /* Allocate a buffer where we construct the new block. */ @@ -870,17 +1877,33 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), s->here); + + if (i->value && s->here->e_value_inum) { + unsigned int ea_ino; + + /* + * A ref count on ea_inode has been taken as part of the call to + * ext4_xattr_set_entry() above. We would like to drop this + * extra ref but we have to wait until the xattr block is + * initialized and has its own ref count on the ea_inode. + */ + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + if (error) { + ea_inode = NULL; + goto cleanup; + } + } inserted: if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + new_bh = ext4_xattr_block_cache_find(inode, header(s->base), + &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) @@ -888,6 +1911,8 @@ inserted: else { u32 ref; + WARN_ON_ONCE(dquot_initialize_needed(inode)); + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -923,7 +1948,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; @@ -942,8 +1967,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_touch(ext4_mb_cache, ce); - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_touch(ea_block_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -954,6 +1979,8 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal, block; + WARN_ON_ONCE(dquot_initialize_needed(inode)); + goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); @@ -980,6 +2007,22 @@ getblk_failed: EXT4_FREE_BLOCKS_METADATA); goto cleanup; } + error = ext4_xattr_inode_inc_ref_all(handle, inode, + ENTRY(header(s->base)+1)); + if (error) + goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. */ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } + lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { @@ -991,7 +2034,7 @@ getblk_failed: ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) @@ -999,17 +2042,40 @@ getblk_failed: } } + if (old_ea_inode_size) + ext4_xattr_inode_free_quota(inode, old_ea_inode_size); + /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ - if (bs->bh && bs->bh != new_bh) - ext4_xattr_release_block(handle, inode, bs->bh); + if (bs->bh && bs->bh != new_bh) { + struct ext4_xattr_inode_array *ea_inode_array = NULL; + + ext4_xattr_release_block(handle, inode, bs->bh, + &ea_inode_array, + 0 /* extra_credits */); + ext4_xattr_inode_array_free(ea_inode_array); + } error = 0; cleanup: + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + /* If there was an error, revert the quota charge. */ + if (error) + ext4_xattr_inode_free_quota(inode, + i_size_read(ea_inode)); + iput(ea_inode); + } if (ce) - mb_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1066,7 +2132,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) { if (error == -ENOSPC && ext4_has_inline_data(inode)) { @@ -1078,7 +2144,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_find(inode, i, is); if (error) return error; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, + false /* is_block */); } if (error) return error; @@ -1094,7 +2161,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(struct inode *inode, +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1104,7 +2171,7 @@ static int ext4_xattr_ibody_set(struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -1123,12 +2190,31 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s, { void *value; + /* When e_value_inum is set the value is stored externally. */ + if (s->here->e_value_inum) + return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } +static struct buffer_head *ext4_xattr_get_block(struct inode *inode) +{ + struct buffer_head *bh; + int error; + + if (!EXT4_I(inode)->i_file_acl) + return NULL; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + return ERR_PTR(-EIO); + error = ext4_xattr_check_block(inode, bh); + if (error) + return ERR_PTR(error); + return bh; +} + /* * ext4_xattr_set_handle() * @@ -1151,7 +2237,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, .name = name, .value = value, .value_len = value_len, - + .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, @@ -1166,8 +2252,31 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, return -EINVAL; if (strlen(name) > 255) return -ERANGE; + ext4_write_lock_xattr(inode, &no_expand); + /* Check journal credits under write lock. */ + if (ext4_handle_valid(handle)) { + struct buffer_head *bh; + int credits; + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto cleanup; + } + + credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, + flags & XATTR_CREATE); + brelse(bh); + + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = -ENOSPC; + goto cleanup; + } + } + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; @@ -1197,9 +2306,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } + if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1210,7 +2320,12 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(inode, &i, &is); + if (ext4_has_feature_ea_inode(inode->i_sb) && + (EXT4_XATTR_SIZE(i.value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + i.in_inode = 1; +retry_inode: + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1221,11 +2336,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); - if (error) - goto cleanup; - if (!is.s.not_found) { + if (!error && !is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } else if (error == -ENOSPC) { + /* + * Xattr does not fit in the block, store at + * external inode if possible. + */ + if (ext4_has_feature_ea_inode(inode->i_sb) && + !i.in_inode) { + i.in_inode = 1; + goto retry_inode; + } } } } @@ -1251,6 +2375,33 @@ cleanup: return error; } +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits) +{ + struct buffer_head *bh; + int err; + + *credits = 0; + + if (!EXT4_SB(inode->i_sb)->s_journal) + return 0; + + down_read(&EXT4_I(inode)->xattr_sem); + + bh = ext4_xattr_get_block(inode); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + } else { + *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, + value_len, is_create); + brelse(bh); + err = 0; + } + + up_read(&EXT4_I(inode)->xattr_sem); + return err; +} + /* * ext4_xattr_set() * @@ -1264,10 +2415,20 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; + struct super_block *sb = inode->i_sb; int error, retries = 0; - int credits = ext4_jbd2_credits_xattr(inode); + int credits; + + error = dquot_initialize(inode); + if (error) + return error; retry: + error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, + &credits); + if (error) + return error; + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -1278,7 +2439,7 @@ retry: value, value_len, flags); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; @@ -1303,7 +2464,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); @@ -1323,18 +2484,16 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; - size_t value_offs, value_size; + size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, + .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int error; - value_offs = le16_to_cpu(entry->e_value_offs); - value_size = le32_to_cpu(entry->e_value_size); - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); buffer = kmalloc(value_size, GFP_NOFS); @@ -1350,7 +2509,15 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, bs->bh = NULL; /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); + if (error) + goto out; + } else { + size_t value_offs = le16_to_cpu(entry->e_value_offs); + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + } + memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; @@ -1364,11 +2531,10 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(inode, &i, is); + error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto out; - i.name = b_entry_name; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); @@ -1412,9 +2578,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); + total_size = EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { @@ -1433,8 +2600,10 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, } entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + total_size = entry_size; + if (!entry->e_value_inum) + total_size += EXT4_XATTR_SIZE( + le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) @@ -1563,51 +2732,172 @@ cleanup: return error; } +#define EIA_INCR 16 /* must be 2^n */ +#define EIA_MASK (EIA_INCR - 1) +/* Add the large xattr @inode into @ea_inode_array for deferred iput(). + * If @ea_inode_array is new or full it will be grown and the old + * contents copied over. + */ +static int +ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, + struct inode *inode) +{ + if (*ea_inode_array == NULL) { + /* + * Start with 15 inodes, so it fits into a power-of-two size. + * If *ea_inode_array is NULL, this is essentially offsetof() + */ + (*ea_inode_array) = + kmalloc(offsetof(struct ext4_xattr_inode_array, + inodes[EIA_MASK]), + GFP_NOFS); + if (*ea_inode_array == NULL) + return -ENOMEM; + (*ea_inode_array)->count = 0; + } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { + /* expand the array once all 15 + n * 16 slots are full */ + struct ext4_xattr_inode_array *new_array = NULL; + int count = (*ea_inode_array)->count; + + /* if new_array is NULL, this is essentially offsetof() */ + new_array = kmalloc( + offsetof(struct ext4_xattr_inode_array, + inodes[count + EIA_INCR]), + GFP_NOFS); + if (new_array == NULL) + return -ENOMEM; + memcpy(new_array, *ea_inode_array, + offsetof(struct ext4_xattr_inode_array, inodes[count])); + kfree(*ea_inode_array); + *ea_inode_array = new_array; + } + (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; + return 0; +} /* * ext4_xattr_delete_inode() * - * Free extended attribute resources associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. + * Free extended attribute resources associated with this inode. Traverse + * all entries and decrement reference on any xattr inodes associated with this + * inode. This is called immediately before an inode is freed. We have exclusive + * access to the inode. If an orphan inode is deleted it will also release its + * references on xattr block and xattr inodes. */ -void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **ea_inode_array, + int extra_credits) { struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_header *header; + struct ext4_iloc iloc = { .bh = NULL }; + struct ext4_xattr_entry *entry; + int error; - if (!EXT4_I(inode)->i_file_acl) - goto cleanup; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); + error = ext4_xattr_ensure_credits(handle, inode, extra_credits, + NULL /* bh */, + false /* dirty */, + false /* block_csum */); + if (error) { + EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - goto cleanup; + + if (ext4_has_feature_ea_inode(inode->i_sb) && + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); + goto cleanup; + } + + error = ext4_journal_get_write_access(handle, iloc.bh); + if (error) { + EXT4_ERROR_INODE(inode, "write access (error %d)", + error); + goto cleanup; + } + + header = IHDR(inode, ext4_raw_inode(&iloc)); + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, + IFIRST(header), + false /* block_csum */, + ea_inode_array, + extra_credits, + false /* skip_quota */); } - ext4_xattr_release_block(handle, inode, bh); - EXT4_I(inode)->i_file_acl = 0; + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + error = ext4_xattr_check_block(inode, bh); + if (error) { + EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", + EXT4_I(inode)->i_file_acl, error); + goto cleanup; + } + + if (ext4_has_feature_ea_inode(inode->i_sb)) { + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) + if (entry->e_value_inum) + ext4_xattr_inode_free_quota(inode, + le32_to_cpu(entry->e_value_size)); + + } + + ext4_xattr_release_block(handle, inode, bh, ea_inode_array, + extra_credits); + /* + * Update i_file_acl value in the same transaction that releases + * block. + */ + EXT4_I(inode)->i_file_acl = 0; + error = ext4_mark_inode_dirty(handle, inode); + if (error) { + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", + error); + goto cleanup; + } + } + error = 0; cleanup: + brelse(iloc.bh); brelse(bh); + return error; +} + +void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) +{ + int idx; + + if (ea_inode_array == NULL) + return; + + for (idx = 0; idx < ea_inode_array->count; ++idx) + iput(ea_inode_array->inodes[idx]); + kfree(ea_inode_array); } /* - * ext4_xattr_cache_insert() + * ext4_xattr_block_cache_insert() * - * Create a new entry in the extended attribute cache, and insert + * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, + struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); @@ -1615,7 +2905,9 @@ ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) EXT4_XATTR_REFCOUNT_MAX; int error; - error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + if (!ea_block_cache) + return; + error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) @@ -1647,11 +2939,11 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || + entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EFSCORRUPTED; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + if (!entry1->e_value_inum && + memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; @@ -1665,7 +2957,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, } /* - * ext4_xattr_cache_find() + * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * @@ -1673,30 +2965,33 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, * not found or an error occurred. */ static struct buffer_head * -ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) +ext4_xattr_block_cache_find(struct inode *inode, + struct ext4_xattr_header *header, + struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + if (!ea_block_cache) + return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); - ce = mb_cache_entry_find_first(ext4_mb_cache, hash); + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_block); + bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long) ce->e_block); + (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ext4_mb_cache, ce); + ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } @@ -1709,30 +3004,22 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, * * Compute the hash of an extended attribute. */ -static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, + size_t value_count) { __u32 hash = 0; - char *name = entry->e_name; - int n; - for (n = 0; n < entry->e_name_len; n++) { + while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; } - - if (entry->e_value_size != 0) { - __le32 *value = (__le32 *)((char *)header + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); } - entry->e_hash = cpu_to_le32(hash); + return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT @@ -1745,13 +3032,11 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, * * Re-compute the extended attribute hash value after an entry has changed. */ -static void ext4_xattr_rehash(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) +static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; - ext4_xattr_hash_entry(header, entry); here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 099c8b670ef5..0d2dde1fa87a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -44,7 +44,7 @@ struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[0]; /* attribute name */ @@ -69,6 +69,13 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +/* + * The minimum size of EA value when you start storing it in an external inode + * size of block - size of header - size of 1 entry - 4 null bytes +*/ +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ + ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) @@ -77,10 +84,11 @@ struct ext4_xattr_entry { #define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { - int name_index; const char *name; const void *value; size_t value_len; + int name_index; + int in_inode; }; struct ext4_xattr_search { @@ -96,6 +104,11 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; +struct ext4_xattr_inode_array { + unsigned int count; /* # of used items in the array */ + struct inode *inodes[0]; +}; + extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; @@ -139,8 +152,16 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, + bool is_create, int *credits); +extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + struct buffer_head *block_bh, size_t value_len, + bool is_create); -extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_inode_array **array, + int extra_credits); +extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -169,3 +190,11 @@ static inline int ext4_init_security(handle_t *handle, struct inode *inode, return 0; } #endif + +#ifdef CONFIG_LOCKDEP +extern void ext4_xattr_inode_set_class(struct inode *ea_inode); +#else +static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } +#endif + +extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 251356859476..87c1f4150c64 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } #endif if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { if (!PageUptodate(page)) SetPageUptodate(page); } else { @@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio) unlock_page(page); mempool_free(page, sbi->write_io_dummy); - if (unlikely(bio->bi_error)) + if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true); continue; } fscrypt_pullback_bio_page(&page, true); - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 42c39f0bfd88..94a88b233e98 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1116,6 +1116,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, { SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = sbi->s_chksum_driver; @@ -1125,7 +1126,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, @@ -2161,26 +2164,6 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 527c9f36d971..a0e6d2c65a9e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1063,11 +1063,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3ed2f947f5da..d53fe620939e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2661,17 +2661,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c246e351103..f964b68718c1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -757,7 +757,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - dc->error = bio->bi_error; + dc->error = blk_status_to_errno(bio->bi_status); dc->state = D_DONE; complete_all(&dc->wait); bio_put(bio); @@ -2918,13 +2918,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2957,7 +2957,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2990,7 +2990,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; + sit_i->mounted_time = ktime_get_real_seconds(); mutex_init(&sit_i->sentry_lock); return 0; } @@ -3008,12 +3008,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -3193,7 +3193,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -3215,7 +3215,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9ba1f1d9723..6b871b492fd5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -716,8 +716,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); - return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t now = ktime_get_real_seconds(); + + return sit_i->elapsed_time + now - sit_i->mounted_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dd92170e0d6d..32e4c025e97e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1967,7 +1967,7 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); - memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; diff --git a/fs/fcntl.c b/fs/fcntl.c index 8bd81c2e89b2..3b01b646e528 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -109,20 +109,34 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -void f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; - struct pid *pid; - int who = arg; + struct pid *pid = NULL; + int who = arg, ret = 0; + type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return -EINVAL; + type = PIDTYPE_PGID; who = -who; } + rcu_read_lock(); - pid = find_vpid(who); - __f_setown(filp, pid, type, force); + if (who) { + pid = find_vpid(who); + if (!pid) + ret = -ESRCH; + } + + if (!ret) + __f_setown(filp, pid, type, force); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL(f_setown); @@ -243,9 +257,72 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif +static bool rw_hint_valid(enum rw_hint hint) +{ + switch (hint) { + case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NONE: + case RWH_WRITE_LIFE_SHORT: + case RWH_WRITE_LIFE_MEDIUM: + case RWH_WRITE_LIFE_LONG: + case RWH_WRITE_LIFE_EXTREME: + return true; + default: + return false; + } +} + +static long fcntl_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 *argp = (u64 __user *)arg; + enum rw_hint hint; + u64 h; + + switch (cmd) { + case F_GET_FILE_RW_HINT: + h = file_write_hint(file); + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_FILE_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + spin_lock(&file->f_lock); + file->f_write_hint = hint; + spin_unlock(&file->f_lock); + return 0; + case F_GET_RW_HINT: + h = inode->i_write_hint; + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + inode_lock(inode); + inode->i_write_hint = hint; + inode_unlock(inode); + return 0; + default: + return -EINVAL; + } +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { + void __user *argp = (void __user *)arg; + struct flock flock; long err = -EINVAL; switch (cmd) { @@ -273,7 +350,11 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_OFD_GETLK: #endif case F_GETLK: - err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_getlk(filp, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -283,7 +364,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, /* Fallthrough */ case F_SETLK: case F_SETLKW: - err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* @@ -297,8 +380,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - f_setown(filp, arg, 1); - err = 0; + err = f_setown(filp, arg, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -337,6 +419,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GET_SEALS: err = shmem_fcntl(filp, cmd, arg); break; + case F_GET_RW_HINT: + case F_SET_RW_HINT: + case F_GET_FILE_RW_HINT: + case F_SET_FILE_RW_HINT: + err = fcntl_rw_hint(filp, cmd, arg); + break; default: break; } @@ -383,7 +471,9 @@ out: SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { + void __user *argp = (void __user *)arg; struct fd f = fdget_raw(fd); + struct flock64 flock; long err = -EBADF; if (!f.file) @@ -401,14 +491,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: case F_OFD_GETLK: - err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_getlk64(f.file, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: - err = fcntl_setlk64(fd, f.file, cmd, - (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_setlk64(fd, f.file, cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, f.file); @@ -422,57 +519,56 @@ out: #endif #ifdef CONFIG_COMPAT -static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) +/* careful - don't use anywhere else */ +#define copy_flock_fields(dst, src) \ + (dst)->l_type = (src)->l_type; \ + (dst)->l_whence = (src)->l_whence; \ + (dst)->l_start = (src)->l_start; \ + (dst)->l_len = (src)->l_len; \ + (dst)->l_pid = (src)->l_pid; + +static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) +{ + struct compat_flock fl; + + if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; + copy_flock_fields(kfl, &fl); return 0; } -static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock64 fl; + + if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; + copy_flock_fields(kfl, &fl); return 0; } -#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 -static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock fl; + + memset(&fl, 0, sizeof(struct compat_flock)); + copy_flock_fields(&fl, kfl); + if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } -#endif -#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 -static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock64 fl; + + memset(&fl, 0, sizeof(struct compat_flock64)); + copy_flock_fields(&fl, kfl); + if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } -#endif +#undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) @@ -489,76 +585,92 @@ convert_fcntl_cmd(unsigned int cmd) return cmd; } +/* + * GETLK was successful and we need to return the data, but it needs to fit in + * the compat structure. + * l_start shouldn't be too big, unless the original start + end is greater than + * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return + * -EOVERFLOW in that case. l_len could be too big, in which case we just + * truncate it, and only allow the app to see that part of the conflicting lock + * that might make sense to it anyway + */ +static int fixup_compat_flock(struct flock *flock) +{ + if (flock->l_start > COMPAT_OFF_T_MAX) + return -EOVERFLOW; + if (flock->l_len > COMPAT_OFF_T_MAX) + flock->l_len = COMPAT_OFF_T_MAX; + return 0; +} + COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; + struct fd f = fdget_raw(fd); + struct flock flock; + long err = -EBADF; + + if (!f.file) + return err; + + if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) + goto out_put; + } + + err = security_file_fcntl(f.file, cmd, arg); + if (err) + goto out_put; switch (cmd) { case F_GETLK: + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + return err; + err = put_compat_flock(&flock, compat_ptr(arg)); + break; + case F_GETLK64: + case F_OFD_GETLK: + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + return err; + err = put_compat_flock64(&flock, compat_ptr(arg)); + break; case F_SETLK: case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. - * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - - case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - default: - ret = sys_fcntl(fd, cmd, arg); + err = do_fcntl(fd, cmd, arg, f.file); break; } - return ret; +out_put: + fdput(f); + return err; } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, @@ -899,16 +1011,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( - O_RDONLY | O_WRONLY | O_RDWR | - O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | /* O_NONBLOCK | */ - __O_SYNC | O_DSYNC | FASYNC | - O_DIRECT | O_LARGEFILE | O_DIRECTORY | - O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE | - __FMODE_NONOTIFY - )); + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + HWEIGHT32( + (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | + __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); diff --git a/fs/file.c b/fs/file.c index ad6f094f2eff..1fc7fbbb4510 100644 --- a/fs/file.c +++ b/fs/file.c @@ -30,21 +30,6 @@ unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; -static void *alloc_fdmem(size_t size) -{ - /* - * Very large allocations can stress page reclaim, so fall back to - * vmalloc() if the allocation size will be considered "large" by the VM. - */ - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | - __GFP_NOWARN | __GFP_NORETRY); - if (data != NULL) - return data; - } - return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); -} - static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); @@ -131,13 +116,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); + data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; - data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); + data = kvmalloc(max_t(size_t, + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), + GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; diff --git a/fs/file_table.c b/fs/file_table.c index 954d510b765a..72e861a35a7f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -168,6 +168,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; + file->f_wb_err = filemap_sample_wb_err(file->f_mapping); if ((mode & FMODE_READ) && likely(fop->read || fop->read_iter)) mode |= FMODE_CAN_READ; diff --git a/fs/filesystems.c b/fs/filesystems.c index cac75547d35c..8b99955e3504 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -275,8 +275,10 @@ struct file_system_type *get_fs_type(const char *name) int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); - if (!fs && (request_module("fs-%.*s", len, name) == 0)) + if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); + WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name); + } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 63ee2940775c..8b426f83909f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2052,11 +2052,13 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) } /** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. + * __mark_inode_dirty - internal function + * + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. * * Put the inode on the super block's dirty list. * diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 611b5408f6ec..e747b3d720ee 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) void pin_kill(struct fs_pin *p) { - wait_queue_t wait; + wait_queue_entry_t wait; if (!p) { rcu_read_unlock(); @@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p) rcu_read_unlock(); schedule(); rcu_read_lock(); - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) break; /* OK, we know p couldn't have been freed yet */ spin_lock_irq(&p->wait.lock); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 6e22748b0704..b9ea99c5b5b3 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) { - struct tree_descr empty_descr = {""}; + static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; int err; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c2d7f3a92679..c16d00e53264 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -20,6 +20,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/sched.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -45,7 +46,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); - atomic_set(&req->count, 1); + refcount_set(&req->count, 1); req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; @@ -102,21 +103,20 @@ void fuse_request_free(struct fuse_req *req) void __fuse_get_request(struct fuse_req *req) { - atomic_inc(&req->count); + refcount_inc(&req->count); } /* Must be called with > 1 refcount */ static void __fuse_put_request(struct fuse_req *req) { - BUG_ON(atomic_read(&req->count) < 2); - atomic_dec(&req->count); + refcount_dec(&req->count); } -static void fuse_req_init_context(struct fuse_req *req) +static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) { req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); - req->in.h.pid = current->pid; + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } void fuse_set_initialized(struct fuse_conn *fc) @@ -163,7 +163,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto out; } - fuse_req_init_context(req); + fuse_req_init_context(fc, req); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); @@ -256,7 +256,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - fuse_req_init_context(req); + fuse_req_init_context(fc, req); __set_bit(FR_WAITING, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); return req; @@ -264,7 +264,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { - if (atomic_dec_and_test(&req->count)) { + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background @@ -1222,6 +1222,9 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_in *in; unsigned reqsize; + if (task_active_pid_ns(current) != fc->pid_ns) + return -EIO; + restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; @@ -1820,6 +1823,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; + if (task_active_pid_ns(current) != fc->pid_ns) + return -EIO; + if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec238fb5a584..3ee4fdc3da9e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); - atomic_set(&ff->count, 1); + refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -77,7 +77,7 @@ void fuse_file_free(struct fuse_file *ff) static struct fuse_file *fuse_file_get(struct fuse_file *ff) { - atomic_inc(&ff->count); + refcount_inc(&ff->count); return ff; } @@ -88,7 +88,7 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) static void fuse_file_put(struct fuse_file *ff, bool sync) { - if (atomic_dec_and_test(&ff->count)) { + if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; if (ff->fc->no_open) { @@ -293,7 +293,7 @@ static int fuse_release(struct inode *inode, struct file *file) void fuse_sync_release(struct fuse_file *ff, int flags) { - WARN_ON(atomic_read(&ff->count) != 1); + WARN_ON(refcount_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); /* * iput(NULL) is a no-op and since the refcount is 1 and everything's @@ -2083,7 +2083,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } -static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, +static int convert_fuse_file_lock(struct fuse_conn *fc, + const struct fuse_file_lock *ffl, struct file_lock *fl) { switch (ffl->type) { @@ -2098,7 +2099,14 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, fl->fl_start = ffl->start; fl->fl_end = ffl->end; - fl->fl_pid = ffl->pid; + + /* + * Convert pid into the caller's pid namespace. If the pid + * does not map into the namespace fl_pid will get set to 0. + */ + rcu_read_lock(); + fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + rcu_read_unlock(); break; default: @@ -2147,7 +2155,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out.args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) - err = convert_fuse_file_lock(&outarg.lk, fl); + err = convert_fuse_file_lock(fc, &outarg.lk, fl); return err; } @@ -2159,7 +2167,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2168,10 +2177,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if (fl->fl_flags & FL_CLOSE) + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; - fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); + if (pid && pid_nr == 0) + return -EOVERFLOW; + + fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fc, &args); /* locking is restartable */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f33341d9501a..1bd7ffdad593 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -24,6 +24,8 @@ #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/xattr.h> +#include <linux/pid_namespace.h> +#include <linux/refcount.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -137,7 +139,7 @@ struct fuse_file { u64 nodeid; /** Refcount */ - atomic_t count; + refcount_t count; /** FOPEN_* flags returned by open */ u32 open_flags; @@ -306,7 +308,7 @@ struct fuse_req { struct list_head intr_entry; /** refcount */ - atomic_t count; + refcount_t count; /** Unique ID for the interrupt request */ u64 intr_unique; @@ -448,7 +450,7 @@ struct fuse_conn { spinlock_t lock; /** Refcount */ - atomic_t count; + refcount_t count; /** Number of fuse_dev's */ atomic_t dev_count; @@ -461,6 +463,9 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; + /** The pid namespace for this mount */ + struct pid_namespace *pid_ns; + /** Maximum read size */ unsigned max_read; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73cf05135252..65c88379a3a1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> +#include <linux/pid_namespace.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -601,7 +602,7 @@ void fuse_conn_init(struct fuse_conn *fc) memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); - atomic_set(&fc->count, 1); + refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); @@ -619,14 +620,16 @@ void fuse_conn_init(struct fuse_conn *fc) fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { - if (atomic_dec_and_test(&fc->count)) { + if (refcount_dec_and_test(&fc->count)) { if (fc->destroy_req) fuse_request_free(fc->destroy_req); + put_pid_ns(fc->pid_ns); fc->release(fc); } } @@ -634,7 +637,7 @@ EXPORT_SYMBOL_GPL(fuse_conn_put); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) { - atomic_inc(&fc->count); + refcount_inc(&fc->count); return fc; } EXPORT_SYMBOL_GPL(fuse_conn_get); @@ -972,8 +975,15 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) int err; char *suffix = ""; - if (sb->s_bdev) + if (sb->s_bdev) { suffix = "-fuseblk"; + /* + * sb->s_bdi points to blkdev's bdi however we want to redirect + * it to our private bdi... + */ + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), MINOR(fc->dev), suffix); if (err) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3814a60e0aea..9fa3aef9a5b3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -970,7 +970,7 @@ more_rgrps: continue; bn = be64_to_cpu(*p); if (gfs2_holder_initialized(rd_gh)) { - rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object; + rgd = gfs2_glock2rgrp(rd_gh->gh_gl); gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); } else { @@ -1072,7 +1072,7 @@ out_unlock: /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ ip->i_inode.i_mtime = ip->i_inode.i_ctime = - CURRENT_TIME; + current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); @@ -1293,7 +1293,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 79113219be5f..db427658ccd9 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1444,7 +1444,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, "g.offset (%u)\n", (unsigned long long)bh->b_blocknr, entries2, g.offset); - + gfs2_consist_inode(ip); error = -EIO; goto out_free; } @@ -1612,6 +1612,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, (unsigned long long)dip->i_no_addr, dip->i_entries, g.offset); + gfs2_consist_inode(dip); error = -EIO; goto out; } @@ -2031,8 +2032,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + rg_blocks += rgd->rd_length; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 959a19ced4d5..c38ab6c81898 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -80,9 +80,9 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; -void gfs2_glock_free(struct gfs2_glock *gl) +static void gfs2_glock_dealloc(struct rcu_head *rcu) { - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); @@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl) kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); } @@ -152,20 +159,34 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -/** - * gfs2_glock_put() - Decrement reference count on glock - * @gl: The glock to put - * +/* + * Enqueue the glock on the work queue. Passes one glock reference on to the + * work queue. */ +static void __gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) { + /* + * We are holding the lockref spinlock, and the work was still + * queued above. The queued work (glock_work_func) takes that + * spinlock before dropping its glock reference(s), so it + * cannot have dropped them in the meantime. + */ + GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); + gl->gl_lockref.count--; + } +} -void gfs2_glock_put(struct gfs2_glock *gl) +static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + spin_lock(&gl->gl_lockref.lock); + __gfs2_glock_queue_work(gl, delay); + spin_unlock(&gl->gl_lockref.lock); +} + +static void __gfs2_glock_put(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (lockref_put_or_lock(&gl->gl_lockref)) - return; - lockref_mark_dead(&gl->gl_lockref); gfs2_glock_remove_from_lru(gl); @@ -178,6 +199,20 @@ void gfs2_glock_put(struct gfs2_glock *gl) } /** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + __gfs2_glock_put(gl); +} + +/** * may_grant - check if its ok to grant a new lock * @gl: The glock * @gh: The lock request which we wish to grant @@ -482,8 +517,7 @@ __acquires(&gl->gl_lockref.lock) target == LM_ST_UNLOCKED && test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { finish_xmote(gl, target); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_queue_work(gl, 0); } else if (ret) { pr_err("lm_lock ret %d\n", ret); @@ -492,8 +526,7 @@ __acquires(&gl->gl_lockref.lock) } } else { /* lock_nolock */ finish_xmote(gl, target); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_queue_work(gl, 0); } spin_lock(&gl->gl_lockref.lock); @@ -565,8 +598,7 @@ out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); gl->gl_lockref.count++; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); return; out_unlock: @@ -601,11 +633,11 @@ static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); - int drop_ref = 0; + unsigned int drop_refs = 1; if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); - drop_ref = 1; + drop_refs++; } spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -623,17 +655,25 @@ static void glock_work_func(struct work_struct *work) } } run_queue(gl, 0); - spin_unlock(&gl->gl_lockref.lock); - if (!delay) - gfs2_glock_put(gl); - else { + if (delay) { + /* Keep one glock reference for the work we requeue. */ + drop_refs--; if (gl->gl_name.ln_type != LM_TYPE_INODE) delay = 0; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); + __gfs2_glock_queue_work(gl, delay); } - if (drop_ref) - gfs2_glock_put(gl); + + /* + * Drop the remaining glock references manually here. (Mind that + * __gfs2_glock_queue_work depends on the lockref spinlock begin held + * here as well.) + */ + gl->gl_lockref.count -= drop_refs; + if (!gl->gl_lockref.count) { + __gfs2_glock_put(gl); + return; + } + spin_unlock(&gl->gl_lockref.lock); } /** @@ -986,8 +1026,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gl->gl_lockref.count++; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); } run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); @@ -1047,17 +1086,15 @@ void gfs2_glock_dq(struct gfs2_holder *gh) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); + if (unlikely(!fast_path)) { + gl->gl_lockref.count++; + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; + __gfs2_glock_queue_work(gl, delay); + } spin_unlock(&gl->gl_lockref.lock); - if (likely(fast_path)) - return; - - gfs2_glock_hold(gl); - if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags) && - gl->gl_name.ln_type == LM_TYPE_INODE) - delay = gl->gl_hold_time; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) @@ -1233,9 +1270,8 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); + __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); } /** @@ -1294,10 +1330,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); - - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); } static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1355,8 +1389,7 @@ add_back_to_lru: if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } @@ -1462,13 +1495,12 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) static void thaw_glock(struct gfs2_glock *gl) { - if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) - goto out; - set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) { -out: + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) { gfs2_glock_put(gl); + return; } + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_queue_work(gl, 0); } /** @@ -1484,9 +1516,8 @@ static void clear_glock(struct gfs2_glock *gl) spin_lock(&gl->gl_lockref.lock); if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED, 0, false); + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); } /** diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index ab1ef322f7a5..9ad4a6ac6c84 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -257,4 +257,11 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +static inline void glock_set_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + gl->gl_object = object; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5db59d444838..5e69636d4dd3 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -137,7 +137,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) * * Called when demoting or unlocking an EX glock. We must flush * to disk all dirty buffers/pages relating to this glock, and must not - * not return to caller to demote/unlock the glock until I/O is complete. + * return to caller to demote/unlock the glock until I/O is complete. */ static void rgrp_go_sync(struct gfs2_glock *gl) @@ -184,7 +184,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; - struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); if (rgd) gfs2_rgrp_brelse(rgd); @@ -197,6 +197,38 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } +static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) + set_bit(GIF_GLOP_PENDING, &ip->i_flags); + spin_unlock(&gl->gl_lockref.lock); + return ip; +} + +struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&gl->gl_lockref.lock); + rgd = gl->gl_object; + spin_unlock(&gl->gl_lockref.lock); + + return rgd; +} + +static void gfs2_clear_glop_pending(struct gfs2_inode *ip) +{ + if (!ip) + return; + + clear_bit_unlock(GIF_GLOP_PENDING, &ip->i_flags); + wake_up_bit(&ip->i_flags, GIF_GLOP_PENDING); +} + /** * inode_go_sync - Sync the dirty data and/or metadata for an inode glock * @gl: the glock protecting the inode @@ -205,25 +237,24 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) static void inode_go_sync(struct gfs2_glock *gl) { - struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gfs2_glock2inode(gl); + int isreg = ip && S_ISREG(ip->i_inode.i_mode); struct address_space *metamapping = gfs2_glock2aspace(gl); int error; - if (ip && !S_ISREG(ip->i_inode.i_mode)) - ip = NULL; - if (ip) { + if (isreg) { if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); inode_dio_wait(&ip->i_inode); } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) - return; + goto out; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH); filemap_fdatawrite(metamapping); - if (ip) { + if (isreg) { struct address_space *mapping = ip->i_inode.i_mapping; filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); @@ -238,6 +269,9 @@ static void inode_go_sync(struct gfs2_glock *gl) */ smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); + +out: + gfs2_clear_glop_pending(ip); } /** @@ -253,7 +287,7 @@ static void inode_go_sync(struct gfs2_glock *gl) static void inode_go_inval(struct gfs2_glock *gl, int flags) { - struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gfs2_glock2inode(gl); gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count)); @@ -274,6 +308,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); + + gfs2_clear_glop_pending(ip); } /** @@ -541,7 +577,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl) */ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { - struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_inode *ip = gl->gl_object; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b7cf65d13561..73fce76e67ee 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -336,7 +336,6 @@ enum { }; struct gfs2_glock { - struct hlist_bl_node gl_list; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; @@ -374,6 +373,7 @@ struct gfs2_glock { loff_t end; } gl_vm; }; + struct rcu_head gl_rcu; struct rhash_head gl_node; }; @@ -386,6 +386,7 @@ enum { GIF_SW_PAGED = 3, GIF_ORDERED = 4, GIF_FREE_VFS_INODE = 5, + GIF_GLOP_PENDING = 6, }; struct gfs2_inode { @@ -815,13 +816,11 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; - int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; unsigned int sd_log_flush_head; - u64 sd_log_flush_wrapped; spinlock_t sd_ail_lock; struct list_head sd_ail1_list; @@ -858,5 +857,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) preempt_enable(); } +extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); + #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9f605ea4810c..acca501f8110 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -144,7 +144,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) goto fail; - ip->i_gl->gl_object = ip; + flush_delayed_work(&ip->i_gl->gl_work); + glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -173,8 +174,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - - ip->i_iopen_gh.gh_gl->gl_object = ip; + flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -201,14 +202,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - ip->i_iopen_gh.gh_gl->gl_object = NULL; + glock_set_object(ip->i_iopen_gh.gh_gl, NULL); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - ip->i_gl->gl_object = NULL; + glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -607,6 +608,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; + gfs2_holder_mark_uninitialized(ghs + 1); error = create_ok(dip, name, mode); if (error) @@ -705,7 +707,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - ip->i_gl->gl_object = ip; + glock_set_object(ip->i_gl, ip); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -731,7 +733,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; - ip->i_iopen_gh.gh_gl->gl_object = ip; + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -778,7 +780,6 @@ fail_gunlock3: fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); - gfs2_glock_dq_uninit(ghs + 1); fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); @@ -799,6 +800,8 @@ fail_gunlock: &GFS2_I(inode)->i_flags); iput(inode); } + if (gfs2_holder_initialized(ghs + 1)) + gfs2_glock_dq_uninit(ghs + 1); fail: return error; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f865b96374df..9a624f694400 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -659,7 +659,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -722,7 +722,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; @@ -775,7 +774,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); - sdp->sd_log_flush_wrapped = 0; log_write_header(sdp, 0); sdp->sd_log_head = sdp->sd_log_flush_head; } @@ -880,7 +878,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index b1f9144b42c7..3010f9edd177 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -71,7 +71,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) { struct gfs2_glock *gl = bd->bd_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; @@ -134,10 +134,8 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && (sdp->sd_log_flush_head != sdp->sd_log_head)); - if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) sdp->sd_log_flush_head = 0; - sdp->sd_log_flush_wrapped = 1; - } } static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) @@ -170,7 +168,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) */ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, - int error) + blk_status_t error) { struct buffer_head *bh, *next; struct page *page = bvec->bv_page; @@ -182,7 +180,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, bh = bh->b_this_page; do { if (error) - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); unlock_buffer(bh); next = bh->b_this_page; size -= bh->b_size; @@ -209,15 +207,13 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_error) { - sdp->sd_log_error = bio->bi_error; - fs_err(sdp, "Error %d writing to log\n", bio->bi_error); - } + if (bio->bi_status) + fs_err(sdp, "Error %d writing to log\n", bio->bi_status); bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); else mempool_free(page, gfs2_page_pool); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 67d1fc4668f7..0a89e6f7a314 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -52,7 +52,6 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - INIT_HLIST_BL_NODE(&gl->gl_list); spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 663ffc135ef3..fabe1614f879 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio) do { struct buffer_head *next = bh->b_this_page; len -= bh->b_size; - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bh = next; } while (bh && len); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ed67548b286c..e76058d34b74 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", bio->bi_error); + pr_warn("error %d reading superblock\n", bio->bi_status); unlock_page(page); } @@ -203,7 +203,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); - memcpy(s->s_uuid, str->sb_uuid, 16); + memcpy(&s->s_uuid, str->sb_uuid, 16); } /** diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 83c9909ff14a..836e38ba5d0a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,9 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - spin_lock(&gl->gl_lockref.lock); - gl->gl_object = NULL; - spin_unlock(&gl->gl_lockref.lock); + glock_set_object(gl, NULL); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } @@ -917,7 +915,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) error = rgd_insert(rgd); spin_unlock(&sdp->sd_rindex_spin); if (!error) { - rgd->rd_gl->gl_object = rgd; + glock_set_object(rgd->rd_gl, rgd); rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 29b0473f6e74..fdedec379b78 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1105,9 +1105,12 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host gfs2_holder_uninit(gh); error = err; } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); + if (!error) { + struct gfs2_rgrpd *rgd = + gfs2_glock2rgrp(gh->gh_gl); + + error = statfs_slow_fill(rgd, sc); + } gfs2_glock_dq_uninit(gh); } } @@ -1535,6 +1538,12 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) goto out; + if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); + gfs2_holder_mark_uninitialized(&gh); + goto alloc_failed; + } + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { @@ -1543,11 +1552,9 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; - } + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; if (test_bit(GIF_INVALID, &ip->i_flags)) { error = gfs2_inode_refresh(ip); @@ -1555,6 +1562,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } +alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; @@ -1621,7 +1629,8 @@ out_unlock: } gfs2_holder_uninit(&ip->i_iopen_gh); } - gfs2_glock_dq_uninit(&gh); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1631,13 +1640,13 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - ip->i_gl->gl_object = NULL; - flush_delayed_work(&ip->i_gl->gl_work); + glock_set_object(ip->i_gl, NULL); + wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; + glock_set_object(ip->i_iopen_gh.gh_gl, NULL); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7a515345610c..ca1f97ff898c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -71,25 +71,14 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); } -static int gfs2_uuid_valid(const u8 *uuid) -{ - int i; - - for (i = 0; i < 16; i++) { - if (uuid[i]) - return 1; - } - return 0; -} - static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) { struct super_block *s = sdp->sd_vfs; - const u8 *uuid = s->s_uuid; + buf[0] = '\0'; - if (!gfs2_uuid_valid(uuid)) + if (uuid_is_null(&s->s_uuid)) return 0; - return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); + return snprintf(buf, PAGE_SIZE, "%pUB\n", &s->s_uuid); } static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) @@ -637,12 +626,12 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct attribute_group tune_group = { +static const struct attribute_group tune_group = { .name = "tune", .attrs = tune_attrs, }; -static struct attribute_group lock_module_group = { +static const struct attribute_group lock_module_group = { .name = "lock_module", .attrs = lock_module_attrs, }; @@ -712,14 +701,13 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); struct super_block *s = sdp->sd_vfs; - const u8 *uuid = s->s_uuid; add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); - if (gfs2_uuid_valid(uuid)) - add_uevent_var(env, "UUID=%pUB", uuid); + if (!uuid_is_null(&s->s_uuid)) + add_uevent_var(env, "UUID=%pUB", &s->s_uuid); return 0; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index d87721aeb575..54179554c7d2 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1327,8 +1327,8 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + rg_blocks += rgd->rd_length; } diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index e33a0d36a93e..5d0182654580 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode) /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; - res = pagecache_write_begin(NULL, mapping, size+1, 0, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size+1, 0, 0, + &page, &fsdata); if (!res) { res = pagecache_write_end(NULL, mapping, size+1, 0, 0, page, fsdata); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index feca524ce2a5..a3eb640b4f8f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode) void *fsdata; loff_t size = inode->i_size; - res = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size, 0, 0, + &page, &fsdata); if (res) return; res = pagecache_write_end(NULL, mapping, size, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dde861387a40..d44f5456eb9b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -200,7 +200,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/inode.c b/fs/inode.c index 131b2bcebc48..50370599e371 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -119,7 +119,7 @@ static int no_open(struct inode *inode, struct file *file) } /** - * inode_init_always - perform inode structure intialisation + * inode_init_always - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; @@ -402,6 +403,8 @@ static void inode_lru_list_add(struct inode *inode) { if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); + else + inode->i_state |= I_REFERENCED; } /* @@ -1489,7 +1492,6 @@ static void iput_final(struct inode *inode) drop = generic_drop_inode(inode); if (!drop && (sb->s_flags & MS_ACTIVE)) { - inode->i_state |= I_REFERENCED; inode_add_lru(inode); spin_unlock(&inode->i_lock); return; @@ -1890,11 +1892,11 @@ static void __wait_on_freeing_inode(struct inode *inode) wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); spin_lock(&inode_hash_lock); } @@ -1913,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -1926,20 +1926,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1957,14 +1952,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) @@ -2022,7 +2014,7 @@ bool inode_owner_or_capable(const struct inode *inode) return true; ns = current_user_ns(); - if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) + if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } @@ -2037,11 +2029,11 @@ static void __inode_dio_wait(struct inode *inode) DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); do { - prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); if (atomic_read(&inode->i_dio_count)) schedule(); } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wait); + finish_wait(wq, &q.wq_entry); } /** diff --git a/fs/internal.h b/fs/internal.h index 076751d90ba2..9676fe11c093 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,8 +126,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, return __atime_needs_update(path, inode, true); } -extern bool atime_needs_update_rcu(const struct path *, struct inode *); - /* * fs-writeback.c */ diff --git a/fs/iomap.c b/fs/iomap.c index 1faabe09b8fd..173222863aca 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ssize_t written = 0; unsigned int flags = AOP_FLAG_NOFS; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). - */ - if (!iter_is_iovec(i)) - flags |= AOP_FLAG_UNINTERRUPTIBLE; - do { struct page *page; unsigned long offset; /* Offset into pagecache page */ @@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, return PTR_ERR(rpage); status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, - &page, iomap); + AOP_FLAG_NOFS, &page, iomap); put_page(rpage); if (unlikely(status)) return status; @@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, struct page *page; int status; - status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, + iomap); if (status) return status; @@ -591,6 +584,100 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } EXPORT_SYMBOL_GPL(iomap_fiemap); +static loff_t +iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, + void *data, struct iomap *iomap) +{ + switch (iomap->type) { + case IOMAP_UNWRITTEN: + offset = page_cache_seek_hole_data(inode, offset, length, + SEEK_HOLE); + if (offset < 0) + return length; + /* fall through */ + case IOMAP_HOLE: + *(loff_t *)data = offset; + return 0; + default: + return length; + } +} + +loff_t +iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +{ + loff_t size = i_size_read(inode); + loff_t length = size - offset; + loff_t ret; + + /* Nothing to be found beyond the end of the file. */ + if (offset >= size) + return -ENXIO; + + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, + &offset, iomap_seek_hole_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + offset += ret; + length -= ret; + } + + return offset; +} +EXPORT_SYMBOL_GPL(iomap_seek_hole); + +static loff_t +iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, + void *data, struct iomap *iomap) +{ + switch (iomap->type) { + case IOMAP_HOLE: + return length; + case IOMAP_UNWRITTEN: + offset = page_cache_seek_hole_data(inode, offset, length, + SEEK_DATA); + if (offset < 0) + return length; + /*FALLTHRU*/ + default: + *(loff_t *)data = offset; + return 0; + } +} + +loff_t +iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +{ + loff_t size = i_size_read(inode); + loff_t length = size - offset; + loff_t ret; + + /* Nothing to be found beyond the end of the file. */ + if (offset >= size) + return -ENXIO; + + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops, + &offset, iomap_seek_data_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + offset += ret; + length -= ret; + } + + if (length <= 0) + return -ENXIO; + return offset; +} +EXPORT_SYMBOL_GPL(iomap_seek_data); + /* * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: @@ -679,8 +766,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_error) - iomap_dio_set_error(dio, bio->bi_error); + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (atomic_dec_and_test(&dio->ref)) { if (is_sync_kiocb(dio->iocb)) { @@ -800,6 +887,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_bdev = iomap->bdev; bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -888,6 +976,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, start, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + flags |= IOMAP_NOWAIT; + } + ret = filemap_write_and_wait_range(mapping, start, end); if (ret) goto out_free_dio; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b6b194ec1b4f..3c1c31321d9b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -263,18 +263,10 @@ static int journal_finish_inode_data_buffers(journal_t *journal, continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); - if (err) { - /* - * Because AS_EIO is cleared by - * filemap_fdatawait_range(), set it again so - * that user process can get -EIO from fsync(). - */ - mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO); - - if (!ret) - ret = err; - } + err = filemap_fdatawait_keep_errors( + jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5a0245e36240..7d5ef3bf3f3e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2363,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { @@ -2579,10 +2579,10 @@ restart: wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&journal->j_list_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); goto restart; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9ee4832b6f8b..8b08044b3120 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -409,25 +409,6 @@ static handle_t *new_handle(int nblocks) return handle; } -/** - * handle_t *jbd2_journal_start() - Obtain a new handle. - * @journal: Journal to start transaction on. - * @nblocks: number of block buffer we might modify - * - * We make sure that the transaction can guarantee at least nblocks of - * modified buffers in the log. We block until the log can guarantee - * that much space. Additionally, if rsv_blocks > 0, we also create another - * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction - * and thus doesn't block transaction commit. If the caller uses this reserved - * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() - * on the parent handle will dispose the reserved one. Reserved handle has to - * be converted to a normal handle using jbd2_journal_start_reserved() before - * it can be used. - * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. - */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, gfp_t gfp_mask, unsigned int type, unsigned int line_no) @@ -478,6 +459,25 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, EXPORT_SYMBOL(jbd2__journal_start); +/** + * handle_t *jbd2_journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. Additionally, if rsv_blocks > 0, we also create another + * handle with rsv_blocks reserved blocks in the journal. This handle is + * is stored in h_rsv_handle. It is not attached to any particular transaction + * and thus doesn't block transaction commit. If the caller uses this reserved + * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() + * on the parent handle will dispose the reserved one. Reserved handle has to + * be converted to a normal handle using jbd2_journal_start_reserved() before + * it can be used. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); @@ -680,6 +680,12 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); handle->h_buffer_credits = nblocks; + /* + * Restore the original nofs context because the journal restart + * is basically the same thing as journal stop and start. + * start_this_handle will start a new nofs context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); ret = start_this_handle(journal, handle, gfp_mask); return ret; } @@ -1066,10 +1072,10 @@ out: * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * - * Returns an error code or 0 on success. + * Returns: error code or 0 on success. * * In full data journalling mode the buffer may be of type BJ_AsyncData, - * because we're write()ing a buffer which is also part of a shared mapping. + * because we're ``write()ing`` a buffer which is also part of a shared mapping. */ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 06a71dbd4833..389ea53ea487 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1366,7 +1366,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, jffs2_add_ino_cache(c, f->inocache); } if (!f->inocache) { - JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino); + JFFS2_ERROR("requested to read a nonexistent ino %u\n", ino); return -ENOENT; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bb1da1feafeb..a21f0e9eecd4 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio) bp->l_flag |= lbmDONE; - if (bio->bi_error) { + if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1403e5..65120a471729 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio) BUG_ON(!PagePrivate(page)); - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } @@ -664,6 +664,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, INCREMENT(mpStat.pagealloc); mp = alloc_metapage(GFP_NOFS); mp->page = page; + mp->sb = inode->i_sb; mp->flag = 0; mp->xflag = COMMIT_PAGE; mp->count = 1; @@ -711,7 +712,8 @@ void force_metapage(struct metapage *mp) get_page(page); lock_page(page); set_page_dirty(page); - write_one_page(page, 1); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); clear_bit(META_forcewrite, &mp->flag); put_page(page); } @@ -756,7 +758,8 @@ void release_metapage(struct metapage * mp) set_page_dirty(page); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); - write_one_page(page, 1); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); lock_page(page); /* write_one_page unlocks the page */ } } else if (mp->lsn) /* discard_metapage doesn't remove it */ diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index a869fb4a20d6..8b0ee514eb84 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -38,6 +38,7 @@ struct metapage { /* implementation */ struct page *page; + struct super_block *sb; unsigned int logical_size; /* Journal management */ diff --git a/fs/libfs.c b/fs/libfs.c index a8b62e5d43a9..3aabe553fc45 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end); * to pass it an appropriate max_reserved value to avoid collisions. */ int simple_fill_super(struct super_block *s, unsigned long magic, - struct tree_descr *files) + const struct tree_descr *files) { struct inode *inode; struct dentry *root; @@ -974,7 +974,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int err; int ret; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; @@ -991,6 +991,10 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, out: inode_unlock(inode); + /* check and advance again to catch errors after syncing out buffers */ + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 41e491b8e5d7..27d577dbe51a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; + host->h_nlmclnt_ops = nlm_init->nlmclnt_ops; return host; out_nobind: nlmclnt_release_host(host); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112952037933..066ac313ae5c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) * @host: address of a valid nlm_host context representing the NLM server * @cmd: fcntl-style file lock operation to perform * @fl: address of arguments for the lock operation + * @data: address of data to be sent to callback operations * */ -int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data) { struct nlm_rqst *call; int status; + const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops; call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call) + nlmclnt_ops->nlmclnt_alloc_call(data); + nlmclnt_locks_init_private(fl, host); if (!fl->fl_u.nfs_fl.owner) { /* lockowner allocation has failed */ @@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); + call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlmclnt_release_call(struct nlm_rqst *call) { + const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; + if (!atomic_dec_and_test(&call->a_count)) return; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) + nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -687,6 +697,19 @@ out: return status; } +static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops; + bool defer_call = false; + + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare) + defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data); + + if (!defer_call) + rpc_call_start(task); +} + static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; @@ -720,6 +743,7 @@ die: } static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_prepare = nlmclnt_unlock_prepare, .rpc_call_done = nlmclnt_unlock_callback, .rpc_release = nlmclnt_rpc_release, }; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e7c8b9c76e48..5d481e8a1b5d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -132,6 +132,8 @@ lockd(void *vrqstp) { int err = 0; struct svc_rqst *rqstp = vrqstp; + struct net *net = &init_net; + struct lockd_net *ln = net_generic(net, lockd_net_id); /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -176,6 +178,8 @@ lockd(void *vrqstp) if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); return 0; } @@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); - cancel_delayed_work_sync(&ln->grace_period_end); - locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5581e020644b..3507c80d1d4b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) if (!(block = nlmsvc_find_block(cookie))) return; - if (block) { - if (status == nlm_lck_denied_grace_period) { - /* Try again in a couple of seconds */ - nlmsvc_insert_block(block, 10 * HZ); - } else { - /* Lock is now held by client, or has been rejected. - * In both cases, the block should be removed. */ - nlmsvc_unlink_block(block); - } + if (status == nlm_lck_denied_grace_period) { + /* Try again in a couple of seconds */ + nlmsvc_insert_block(block, 10 * HZ); + } else { + /* + * Lock is now held by client, or has been rejected. + * In both cases, the block should be removed. + */ + nlmsvc_unlink_block(block); } nlmsvc_release_block(block); } diff --git a/fs/locks.c b/fs/locks.c index 26811321d39b..afefeb4ad6de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1858,8 +1858,8 @@ EXPORT_SYMBOL(generic_setlease); * * Call this to establish a lease on the file. The "lease" argument is not * used for F_UNLCK requests and may be NULL. For commands that set or alter - * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set; - * if not, this function will return -ENOLCK (and generate a scary-looking + * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be + * set; if not, this function will return -ENOLCK (and generate a scary-looking * stack trace). * * The "priv" pointer is passed directly to the lm_setup function as-is. It @@ -1972,15 +1972,13 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * @cmd: the type of lock to apply. * * Apply a %FL_FLOCK style lock to an open file descriptor. - * The @cmd can be one of + * The @cmd can be one of: * - * %LOCK_SH -- a shared lock. - * - * %LOCK_EX -- an exclusive lock. - * - * %LOCK_UN -- remove an existing lock. - * - * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * - %LOCK_SH -- a shared lock. + * - %LOCK_EX -- an exclusive lock. + * - %LOCK_UN -- remove an existing lock. + * - %LOCK_MAND -- a 'mandatory' flock. + * This exists to emulate Windows Share Modes. * * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other * processes read and write access respectively. @@ -2086,26 +2084,22 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock file_lock; - struct flock flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, &flock); + error = flock_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK; @@ -2117,15 +2111,12 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(&flock, &file_lock); + error = posix_lock_to_flock(flock, &file_lock); if (error) goto rel_priv; } - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; rel_priv: locks_release_private(&file_lock); out: @@ -2218,26 +2209,16 @@ check_fmode_for_setlk(struct file_lock *fl) * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock __user *l) + struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - inode = locks_inode(filp); - - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2246,7 +2227,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock_to_posix_lock(filp, file_lock, &flock); + error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2261,7 +2242,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK; @@ -2270,7 +2251,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW; @@ -2315,26 +2296,22 @@ out: /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock file_lock; - struct flock64 flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, &flock); + error = flock64_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK64; @@ -2346,13 +2323,9 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(&flock, &file_lock); - - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; + posix_lock_to_flock64(flock, &file_lock); locks_release_private(&file_lock); out: @@ -2363,26 +2336,16 @@ out: * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock64 __user *l) + struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock64 flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - - inode = locks_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2391,7 +2354,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock64_to_posix_lock(filp, file_lock, &flock); + error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2406,7 +2369,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK64; @@ -2415,7 +2378,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW64; @@ -2504,7 +2467,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, - .fl_flags = FL_FLOCK, + .fl_flags = FL_FLOCK | FL_CLOSE, .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; diff --git a/fs/mbcache.c b/fs/mbcache.c index b19be429d655..d818fd236787 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -10,13 +10,14 @@ /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in - * mb_cache_entry_delete_block()). + * mb_cache_entry_delete()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. - * They use hash of a block contents as a key and block number as a value. - * That's why keys need not be unique (different xattr blocks may end up having - * the same hash). However block number always uniquely identifies a cache - * entry. + * Ext4 also uses it for deduplication of xattr values stored in inodes. + * They use hash of data as a key and provide a value that may represent a + * block or inode number. That's why keys need not be unique (hash of different + * data may be the same). However user provided value always uniquely + * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. Fixed @@ -62,15 +63,15 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry - * @block - block that contains data - * @reusable - is the block reusable by other inodes? + * @value - value of the entry + * @reusable - is the entry reusable by others? * - * Creates entry in @cache with key @key and records that data is stored in - * block @block. The function returns -EBUSY if entry with the same key - * and for the same block already exists in cache. Otherwise 0 is returned. + * Creates entry in @cache with key @key and value @value. The function returns + * -EBUSY if entry with the same key and value already exists in cache. + * Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block, bool reusable) + u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; @@ -91,12 +92,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, /* One ref for hash, one ref returned */ atomic_set(&entry->e_refcnt, 1); entry->e_key = key; - entry->e_block = block; + entry->e_value = value; entry->e_reusable = reusable; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { - if (dup->e_key == key && dup->e_block == block) { + if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; @@ -187,13 +188,13 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_next); /* - * mb_cache_entry_get - get a cache entry by block number (and key) + * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with - * @key - key of block number @block - * @block - block number + * @key - key + * @value - value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, - sector_t block) + u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; @@ -202,7 +203,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { + if (entry->e_key == key && entry->e_value == value) { atomic_inc(&entry->e_refcnt); goto out; } @@ -214,15 +215,14 @@ out: } EXPORT_SYMBOL(mb_cache_entry_get); -/* mb_cache_entry_delete_block - remove information about block from cache +/* mb_cache_entry_delete - remove a cache entry * @cache - cache we work with - * @key - key of block @block - * @block - block number + * @key - key + * @value - value * - * Remove entry from cache @cache with key @key with data stored in @block. + * Remove entry from cache @cache with key @key and value @value. */ -void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, - sector_t block) +void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; @@ -231,7 +231,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { + if (entry->e_key == key && entry->e_value == value) { /* We keep hash list reference to keep entry alive */ hlist_bl_del_init(&entry->e_hash_list); hlist_bl_unlock(head); @@ -248,7 +248,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, } hlist_bl_unlock(head); } -EXPORT_SYMBOL(mb_cache_entry_delete_block); +EXPORT_SYMBOL(mb_cache_entry_delete); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 7edc9b395700..baa9721f1299 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -57,7 +57,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 4c57c9af6946..2d1ca08870f7 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -142,7 +142,7 @@ changed: return -EAGAIN; } -static inline int get_block(struct inode * inode, sector_t block, +static int get_block(struct inode * inode, sector_t block, struct buffer_head *bh, int create) { int err = -EIO; diff --git a/fs/mount.h b/fs/mount.h index bf1fda6eed8f..de45d9e76748 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,6 +58,7 @@ struct mount { struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/mpage.c b/fs/mpage.c index baff8f820c29..2e4c41ccb5c9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); @@ -344,6 +345,7 @@ confused: * * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: + * * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 * * because the indirect block has to be read to get the mappings of blocks @@ -614,6 +616,7 @@ alloc_new: goto confused; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/namei.c b/fs/namei.c index 9a7f8bd748d8..e0b46eb0e212 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1008,7 +1008,7 @@ static int may_linkat(struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (inode_owner_or_capable(inode) || safe_hardlink_source(inode)) + if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; audit_log_link_denied("linkat", link); @@ -2142,7 +2142,6 @@ OK: static const char *path_init(struct nameidata *nd, unsigned flags) { - int retval = 0; const char *s = nd->name->name; if (!*s) @@ -2154,13 +2153,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; - if (*s) { - if (!d_can_lookup(root)) - return ERR_PTR(-ENOTDIR); - retval = inode_permission(inode, MAY_EXEC); - if (retval) - return ERR_PTR(retval); - } + if (*s && unlikely(!d_can_lookup(root))) + return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { @@ -2258,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd) return walk_component(nd, 0); } +static int handle_lookup_down(struct nameidata *nd) +{ + struct path path = nd->path; + struct inode *inode = nd->inode; + unsigned seq = nd->seq; + int err; + + if (nd->flags & LOOKUP_RCU) { + /* + * don't bother with unlazy_walk on failure - we are + * at the very beginning of walk, so we lose nothing + * if we simply redo everything in non-RCU mode + */ + if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) + return -ECHILD; + } else { + dget(path.dentry); + err = follow_managed(&path, nd); + if (unlikely(err < 0)) + return err; + inode = d_backing_inode(path.dentry); + seq = 0; + } + path_to_nameidata(&path, nd); + nd->inode = inode; + nd->seq = seq; + return 0; +} + /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { @@ -2266,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path if (IS_ERR(s)) return PTR_ERR(s); + + if (unlikely(flags & LOOKUP_DOWN)) { + err = handle_lookup_down(nd); + if (unlikely(err < 0)) { + terminate_walk(nd); + return err; + } + } + while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); @@ -4300,6 +4332,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: + * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on @@ -4330,11 +4363,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, { int error; bool is_dir = d_is_dir(old_dentry); - const unsigned char *old_name; struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + struct name_snapshot old_name; if (source == target) return 0; @@ -4381,7 +4414,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); @@ -4436,14 +4469,14 @@ out: inode_unlock(target); dput(new_dentry); if (!error) { - fsnotify_move(old_dir, new_dir, old_name, is_dir, + fsnotify_move(old_dir, new_dir, old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, new_is_dir, NULL, new_dentry); } } - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); return error; } @@ -4766,7 +4799,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + unsigned int flags = 0; if (nofs) flags |= AOP_FLAG_NOFS; diff --git a/fs/namespace.c b/fs/namespace.c index b3b115bd4e1e..81f934b5d571 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_LIST_HEAD(&mnt->mnt_umounting); init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; @@ -3238,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3247,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); @@ -3462,8 +3457,9 @@ static void mntns_put(struct ns_common *ns) static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = to_mnt_ns(ns); + struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct path root; + int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || @@ -3474,15 +3470,20 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return -EINVAL; get_mnt_ns(mnt_ns); - put_mnt_ns(nsproxy->mnt_ns); + old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ - root.mnt = &mnt_ns->root->mnt; - root.dentry = mnt_ns->root->mnt.mnt_root; - path_get(&root); - while(d_mountpoint(root.dentry) && follow_down_one(&root)) - ; + err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, + "/", LOOKUP_DOWN, &root); + if (err) { + /* revert to old namespace */ + nsproxy->mnt_ns = old_mnt_ns; + put_mnt_ns(mnt_ns); + return err; + } + + put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 0c3905e0542e..6719c0be674d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf) * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f31fd0dd92c6..69d02cf8cf37 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -123,11 +123,6 @@ config PNFS_BLOCK depends on NFS_V4_1 && BLK_DEV_DM default NFS_V4 -config PNFS_OBJLAYOUT - tristate - depends on NFS_V4_1 && SCSI_OSD_ULD - default NFS_V4 - config PNFS_FLEXFILE_LAYOUT tristate depends on NFS_V4_1 && NFS_V3 diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6abdda209642..98f4e5728a67 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -31,6 +31,5 @@ nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0ca370d23ddb..d8863a804b15 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio) struct parallel_io *par = bio->bi_private; struct nfs_pgio_header *header = par->data; - if (bio->bi_error) { + if (bio->bi_status) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 773774531aff..73a1f928226c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -221,14 +229,14 @@ err_bind: static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f073a6d2c6a5..52479f180ea1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -131,10 +131,11 @@ restart: if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -170,10 +171,11 @@ restart: if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -317,31 +319,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - u32 res; - - dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); if (args->cbl_recall_type == RETURN_FILE) - res = initiate_file_draining(clp, args); - else - res = initiate_bulk_draining(clp, args); - dprintk("%s returning %i\n", __func__, res); - return res; - + return initiate_file_draining(clp, args); + return initiate_bulk_draining(clp, args); } __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps) { - u32 res; - - dprintk("%s: -->\n", __func__); + u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) res = do_callback_layoutrecall(cps->clp, args); - else - res = NFS4ERR_OP_NOT_IN_SESSION; - - dprintk("%s: exit with status = %d\n", __func__, res); return cpu_to_be32(res); } @@ -364,8 +353,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; - dprintk("%s: -->\n", __func__); - if (!clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; @@ -384,8 +371,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, goto found; } rcu_read_unlock(); - dprintk("%s: layout type %u not found\n", - __func__, dev->cbd_layout_type); continue; } @@ -395,8 +380,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, out: kfree(args->devs); - dprintk("%s: exit with status = %u\n", - __func__, be32_to_cpu(res)); return res; } @@ -417,16 +400,11 @@ static __be32 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, const struct cb_sequenceargs * args) { - dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n", - __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr); - if (args->csa_slotid > tbl->server_highest_slotid) return htonl(NFS4ERR_BADSLOT); /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { - dprintk("%s seqid %u is a replay\n", - __func__, args->csa_sequenceid); if (nfs4_test_locked_slot(tbl, slot->slot_nr)) return htonl(NFS4ERR_DELAY); /* Signal process_op to set this error on next op */ @@ -480,15 +458,6 @@ static bool referring_call_exists(struct nfs_client *clp, for (j = 0; j < rclist->rcl_nrefcalls; j++) { ref = &rclist->rcl_refcalls[j]; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " - "slotid %u\n", __func__, - ((u32 *)&rclist->rcl_sessionid.data)[0], - ((u32 *)&rclist->rcl_sessionid.data)[1], - ((u32 *)&rclist->rcl_sessionid.data)[2], - ((u32 *)&rclist->rcl_sessionid.data)[3], - ref->rc_sequenceid, ref->rc_slotid); - status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, ref->rc_sequenceid, HZ >> 1) < 0; if (status) @@ -593,8 +562,6 @@ out: res->csr_status = status; trace_nfs4_cb_sequence(args, res, status); - dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, - ntohl(status), ntohl(res->csr_status)); return status; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d051fc3583a9..390ac9c39c59 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -171,8 +171,6 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } hdr->nops = ntohl(*p); - dprintk("%s: minorversion %d nops %d\n", __func__, - hdr->minorversion, hdr->nops); return 0; } @@ -192,11 +190,8 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr status = decode_fh(xdr, &args->fh); if (unlikely(status != 0)) - goto out; - status = decode_bitmap(xdr, args->bitmap); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_bitmap(xdr, args->bitmap); } static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) @@ -206,17 +201,12 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 4); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_RESOURCE); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); - status = decode_fh(xdr, &args->fh); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return decode_fh(xdr, &args->fh); } #if defined(CONFIG_NFS_V4_1) @@ -235,10 +225,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, uint32_t iomode; p = read_buf(xdr, 4 * sizeof(uint32_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); args->cbl_layout_type = ntohl(*p++); /* Depite the spec's xdr, iomode really belongs in the FILE switch, @@ -252,37 +240,23 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, args->cbl_range.iomode = iomode; status = decode_fh(xdr, &args->cbl_fh); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_layout_stateid(xdr, &args->cbl_stateid); - if (unlikely(status != 0)) - goto out; + return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); p = xdr_decode_hyper(p, &args->cbl_fsid.minor); - } else if (args->cbl_recall_type != RETURN_ALL) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } - dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", - __func__, - args->cbl_layout_type, iomode, - args->cbl_layoutchanged, args->cbl_recall_type); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + } else if (args->cbl_recall_type != RETURN_ALL) + return htonl(NFS4ERR_BADXDR); + return 0; } static @@ -437,12 +411,11 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, status = decode_sessionid(xdr, &args->csa_sessionid); if (status) - goto out; + return status; - status = htonl(NFS4ERR_RESOURCE); p = read_buf(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); args->csa_addr = svc_addr(rqstp); args->csa_sequenceid = ntohl(*p++); @@ -456,7 +429,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, sizeof(*args->csa_rclists), GFP_KERNEL); if (unlikely(args->csa_rclists == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); for (i = 0; i < args->csa_nrclists; i++) { status = decode_rc_list(xdr, &args->csa_rclists[i]); @@ -466,27 +439,13 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, } } } - status = 0; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " - "highestslotid %u cachethis %d nrclists %u\n", - __func__, - ((u32 *)&args->csa_sessionid)[0], - ((u32 *)&args->csa_sessionid)[1], - ((u32 *)&args->csa_sessionid)[2], - ((u32 *)&args->csa_sessionid)[3], - args->csa_sequenceid, args->csa_slotid, - args->csa_highestslotid, args->csa_cachethis, - args->csa_nrclists); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; out_free: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - goto out; + return status; } static __be32 decode_recallany_args(struct svc_rqst *rqstp, @@ -557,11 +516,8 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream status = decode_fh(xdr, &args->cbnl_fh); if (unlikely(status != 0)) - goto out; - status = decode_lockowner(xdr, args); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_lockowner(xdr, args); } #endif /* CONFIG_NFS_V4_1 */ @@ -707,7 +663,6 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -734,11 +689,11 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, __be32 status = res->csr_status; if (unlikely(status != 0)) - goto out; + return status; status = encode_sessionid(xdr, &res->csr_sessionid); if (status) - goto out; + return status; p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) @@ -748,9 +703,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, *p++ = htonl(res->csr_slotid); *p++ = htonl(res->csr_highestslotid); *p++ = htonl(res->csr_target_highestslotid); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; } static __be32 @@ -800,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session, * A single slot, so highest used slotid is either 0 or -1 */ nfs4_free_slot(tbl, slot); - nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } @@ -871,14 +823,10 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, long maxlen; __be32 res; - dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); if (unlikely(status)) return status; - dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", - __func__, cps->minorversion, nop, op_nr); - switch (cps->minorversion) { case 0: status = preprocess_nfs4_op(op_nr, &op); @@ -917,7 +865,6 @@ encode_hdr: return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); - dprintk("%s: done, status = %d\n", __func__, ntohl(status)); return status; } @@ -937,8 +884,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r }; unsigned int nops = 0; - dprintk("%s: start\n", __func__); - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); @@ -977,7 +922,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); - dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; out_invalidcred: diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 04d15a0045e3..ee5ddbd36088 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -240,8 +241,6 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs_fscache_release_client_cookie(clp); /* -EIO all pending I/O */ @@ -256,8 +255,6 @@ void nfs_free_client(struct nfs_client *clp) kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree(clp); - - dprintk("<-- nfs_free_client()\n"); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -271,7 +268,6 @@ void nfs_put_client(struct nfs_client *clp) if (!clp) return; - dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); nn = net_generic(clp->cl_net, nfs_net_id); if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { @@ -382,9 +378,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, } smp_rmb(); - - dprintk("<-- %s found nfs_client %p for %s\n", - __func__, clp, cl_init->hostname ?: ""); return clp; } @@ -403,9 +396,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) return NULL; } - dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname, rpc_ops->version); - /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); @@ -430,8 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); - dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", - cl_init->hostname, PTR_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); @@ -558,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server) .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, + .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, }; if (nlm_init.nfs_version > 3) @@ -624,27 +613,21 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, { int error; - if (clp->cl_cons_state == NFS_CS_READY) { - /* the client is already initialised */ - dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + /* the client is already initialised */ + if (clp->cl_cons_state == NFS_CS_READY) return clp; - } /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - nfs_mark_client_ready(clp, NFS_CS_READY); + nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); + if (error < 0) { + nfs_put_client(clp); + clp = ERR_PTR(error); + } return clp; - -error: - nfs_mark_client_ready(clp, error); - nfs_put_client(clp); - dprintk("<-- nfs_init_client() = xerror %d\n", error); - return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_init_client); @@ -668,8 +651,6 @@ static int nfs_init_server(struct nfs_server *server, struct nfs_client *clp; int error; - dprintk("--> nfs_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) @@ -677,10 +658,8 @@ static int nfs_init_server(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + if (IS_ERR(clp)) return PTR_ERR(clp); - } server->nfs_client = clp; @@ -725,13 +704,11 @@ static int nfs_init_server(struct nfs_server *server, server->mountd_protocol = data->mount_server.protocol; server->namelen = data->namlen; - dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; error: server->nfs_client = NULL; nfs_put_client(clp); - dprintk("<-- nfs_init_server() = xerror %d\n", error); return error; } @@ -798,12 +775,10 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs struct nfs_client *clp = server->nfs_client; int error; - dprintk("--> nfs_probe_fsinfo()\n"); - if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) - goto out_error; + return error; } fsinfo.fattr = fattr; @@ -811,7 +786,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) - goto out_error; + return error; nfs_server_set_fsinfo(server, &fsinfo); @@ -826,12 +801,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs server->namelen = pathinfo.max_namelen; } - dprintk("<-- nfs_probe_fsinfo() = 0\n"); return 0; - -out_error: - dprintk("nfs_probe_fsinfo: error = %d\n", -error); - return error; } EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); @@ -927,8 +897,6 @@ EXPORT_SYMBOL_GPL(nfs_alloc_server); */ void nfs_free_server(struct nfs_server *server) { - dprintk("--> nfs_free_server()\n"); - nfs_server_remove_lists(server); if (server->destroy != NULL) @@ -946,7 +914,6 @@ void nfs_free_server(struct nfs_server *server) nfs_free_iostats(server->io_stats); kfree(server); nfs_release_automount_timer(); - dprintk("<-- nfs_free_server()\n"); } EXPORT_SYMBOL_GPL(nfs_free_server); @@ -1026,10 +993,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fattr *fattr_fsinfo; int error; - dprintk("--> nfs_clone_server(,%llx:%llx,)\n", - (unsigned long long) fattr->fsid.major, - (unsigned long long) fattr->fsid.minor); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1061,10 +1024,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - dprintk("Cloned FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - error = nfs_start_lockd(server); if (error < 0) goto out_free_server; @@ -1073,13 +1032,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, server->mount_time = jiffies; nfs_free_fattr(fattr_fsinfo); - dprintk("<-- nfs_clone_server() = %p\n", server); return server; out_free_server: nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); - dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f92ba8d6c556..2ac00bf4ecf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate_shared = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -145,7 +145,6 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - atomic_t refcount; int size; int eof_index; u64 last_cookie; @@ -171,27 +170,6 @@ typedef struct { } nfs_readdir_descriptor_t; /* - * The caller is responsible for calling nfs_readdir_release_array(page) - */ -static -struct nfs_cache_array *nfs_readdir_get_array(struct page *page) -{ - void *ptr; - if (page == NULL) - return ERR_PTR(-EIO); - ptr = kmap(page); - if (ptr == NULL) - return ERR_PTR(-ENOMEM); - return ptr; -} - -static -void nfs_readdir_release_array(struct page *page) -{ - kunmap(page); -} - -/* * we are freeing strings created by nfs_add_to_readdir_array() */ static @@ -201,18 +179,9 @@ void nfs_readdir_clear_array(struct page *page) int i; array = kmap_atomic(page); - if (atomic_dec_and_test(&array->refcount)) - for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); - kunmap_atomic(array); -} - -static bool grab_page(struct page *page) -{ - struct nfs_cache_array *array = kmap_atomic(page); - bool res = atomic_inc_not_zero(&array->refcount); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); kunmap_atomic(array); - return res; } /* @@ -239,13 +208,10 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array = kmap(page); struct nfs_cache_array_entry *cache_entry; int ret; - if (IS_ERR(array)) - return PTR_ERR(array); - cache_entry = &array->array[array->size]; /* Check that this entry lies within the page bounds */ @@ -264,7 +230,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) array->eof_index = array->size; out: - nfs_readdir_release_array(page); + kunmap(page); return ret; } @@ -353,11 +319,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array; int status; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out; - } + array = kmap(desc->page); if (*desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -369,8 +331,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); return status; } @@ -606,13 +567,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = nfs_readdir_get_array(page); - if (!IS_ERR(array)) { - array->eof_index = array->size; - status = 0; - nfs_readdir_release_array(page); - } else - status = PTR_ERR(array); + array = kmap(page); + array->eof_index = array->size; + status = 0; + kunmap(page); } put_page(scratch); @@ -674,13 +632,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - array = nfs_readdir_get_array(page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out_label_free; - } + array = kmap(page); memset(array, 0, sizeof(struct nfs_cache_array)); - atomic_set(&array->refcount, 1); array->eof_index = -1; status = nfs_readdir_alloc_pages(pages, array_size); @@ -703,8 +656,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, nfs_readdir_free_pages(pages, array_size); out_release_array: - nfs_readdir_release_array(page); -out_label_free: + kunmap(page); nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); @@ -743,7 +695,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { - nfs_readdir_clear_array(desc->page); + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); put_page(desc->page); desc->page = NULL; } @@ -751,16 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - - for (;;) { - page = read_cache_page(desc->file->f_mapping, + return read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page) || grab_page(page)) - break; - put_page(page); - } - return page; } /* @@ -809,12 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array = NULL; struct nfs_open_dir_context *ctx = file->private_data; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - res = PTR_ERR(array); - goto out; - } - + array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -835,8 +775,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) if (array->eof_index >= 0) desc->eof = 1; - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); cache_page_release(desc); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); @@ -966,11 +905,13 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { + struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -978,13 +919,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset >= 0) break; default: - return -EINVAL; + offset = -EINVAL; + goto out; } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } +out: + inode_unlock(inode); return offset; } @@ -2002,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); -static void -nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) -{ - struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - - nfs_mark_for_revalidate(old_inode); - - switch (task->tk_status) { - case 0: - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(data->new_dir)); - break; - case -ENOENT: - nfs_dentry_handle_enoent(old_dentry); - } -} - /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2055,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL; + struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; @@ -2078,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the * rename, we unhash the dentry in advance. */ - if (!d_unhashed(new_dentry)) + if (!d_unhashed(new_dentry)) { d_drop(new_dentry); + rehash = new_dentry; + } if (d_count(new_dentry) > 2) { int err; @@ -2096,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } @@ -2104,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, - nfs_complete_rename); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2115,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); + nfs_mark_for_revalidate(old_inode); out: + if (rehash) + d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + /* + * The d_move() should be here instead of in an async RPC completion + * handler because we need the proper locks to move the dentry. If + * we're interrupted by a signal, the async RPC completion handler + * should mark the directories for revalidation. + */ + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + /* new dentry created? */ if (dentry) dput(dentry); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c1b5fed7c863..6fb9fad2d1e6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -392,16 +392,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) nfs_direct_req_release(dreq); } -static void nfs_direct_readpage_release(struct nfs_page *req) -{ - dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), - req->wb_bytes, - (long long)req_offset(req)); - nfs_release_request(req); -} - static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -426,7 +416,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); - nfs_direct_readpage_release(req); + nfs_release_request(req); } out_put: if (put_dreq(dreq)) @@ -700,16 +690,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) int status = data->task.tk_status; nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0) { - dprintk("NFS: %5u commit failed with error %d.\n", - data->task.tk_pid, status); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { - dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); + if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } - dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 668213984d68..5713eb32a45e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_launder_page(inode, page); + return nfs_wb_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -697,14 +697,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!IS_ERR(l_ctx)) { status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); - if (status < 0) + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + if (status < 0 && !(fl->fl_flags & FL_CLOSE)) return status; } - /* NOTE: special case - * If we're signalled while cleaning up locks on process exit, we - * still need to complete the unlock. - */ /* * Use local locking if mounted with "-onolock" or with appropriate * "-olocal_lock=" @@ -820,9 +820,23 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + /* + * VFS doesn't require the open mode to match a flock() lock's type. + * NFS, however, may simulate flock() locking with posix locking which + * requires the open mode to match the lock type. + */ + switch (fl->fl_type) { + case F_UNLCK: return do_unlk(filp, cmd, fl, is_local); + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index acd30baca461..1cf85d65b748 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -921,11 +921,11 @@ fl_pnfs_update_layout(struct inode *ino, fl = FILELAYOUT_LSEG(lseg); status = filelayout_check_deviceid(lo, fl, gfp_flags); - if (status) + if (status) { + pnfs_put_lseg(lseg); lseg = ERR_PTR(status); + } out: - if (IS_ERR(lseg)) - pnfs_put_lseg(lseg); return lseg; } @@ -933,6 +933,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -959,6 +960,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 42dedf2d625f..23542dc44a25 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -454,6 +454,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; /* fh */ + rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; @@ -846,6 +847,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; retry: + pnfs_generic_pg_check_layout(pgio); /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -894,6 +896,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int status; retry: + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -1800,16 +1803,16 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); if (!ds) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); if (!ds_cred) - return PNFS_NOT_ATTEMPTED; + goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1839,6 +1842,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) sync, RPC_TASK_SOFTCONN); put_rpccred(ds_cred); return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_avoid_mds_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -2354,10 +2362,21 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return 0; } +static int +ff_layout_set_layoutdriver(struct nfs_server *server, + const struct nfs_fh *dummy) +{ +#if IS_ENABLED(CONFIG_NFS_V4_2) + server->caps |= NFS_CAP_LAYOUTSTATS; +#endif + return 0; +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, + .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, .alloc_lseg = ff_layout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 457cfeb1d5c1..6df7a0cf5660 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -119,7 +119,13 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; - if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + /* + * check for valid major/minor combination. + * currently we support dataserver which talk: + * v3, v4.0, v4.1, v4.2 + */ + if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) || + (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) { dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, i, ds_versions[i].version, ds_versions[i].minor_version); @@ -415,7 +421,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ - if (ds->ds_clp) { + if (!status) { max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f489a5a71bd5..1de93ba78dc9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -734,7 +734,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat, if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - nfs_readdirplus_parent_cache_miss(path->dentry); + if (!(server->flags & NFS_MOUNT_NOAC)) + nfs_readdirplus_parent_cache_miss(path->dentry); + else + nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); } else nfs_readdirplus_parent_cache_hit(path->dentry); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b38fedb7e03..8701d7617964 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -7,6 +7,7 @@ #include <linux/security.h> #include <linux/crc32.h> #include <linux/nfs_page.h> +#include <linux/wait_bit.h> #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) @@ -398,7 +399,6 @@ extern struct file_system_type nfs4_referral_fs_type; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); -void nfs_initialise_sb(struct super_block *); int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, @@ -458,7 +458,6 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ -void nfs_clone_super(struct super_block *, struct nfs_mount_info *); void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); @@ -495,7 +494,6 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -756,9 +754,13 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EACCES: + case -EDQUOT: + case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: + case -ESTALE: case -E2BIG: return true; default: diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 786f17580582..e5686be67be8 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -143,11 +143,8 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; - dprintk("--> nfs_d_automount()\n"); - - mnt = ERR_PTR(-ESTALE); if (IS_ROOT(path->dentry)) - goto out_nofree; + return ERR_PTR(-ESTALE); mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); @@ -155,13 +152,10 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fh == NULL || fattr == NULL) goto out; - dprintk("%s: enter\n", __func__); - mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); if (IS_ERR(mnt)) goto out; - dprintk("%s: done, success\n", __func__); mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); @@ -169,11 +163,6 @@ struct vfsmount *nfs_d_automount(struct path *path) out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); -out_nofree: - if (IS_ERR(mnt)) - dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); - else - dprintk("<-- %s() = %p\n", __func__, mnt); return mnt; } @@ -248,27 +237,20 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, .fattr = fattr, .authflavor = authflavor, }; - struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct vfsmount *mnt; char *page = (char *) __get_free_page(GFP_USER); char *devname; - dprintk("--> nfs_do_submount()\n"); - - dprintk("%s: submounting on %pd2\n", __func__, - dentry); if (page == NULL) - goto out; + return ERR_PTR(-ENOMEM); + devname = nfs_devname(dentry, page, PAGE_SIZE); - mnt = (struct vfsmount *)devname; if (IS_ERR(devname)) - goto free_page; - mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); -free_page: - free_page((unsigned long)page); -out: - dprintk("%s: done\n", __func__); + mnt = ERR_CAST(devname); + else + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); - dprintk("<-- nfs_do_submount() = %p\n", mnt); + free_page((unsigned long)page); return mnt; } EXPORT_SYMBOL_GPL(nfs_do_submount); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index dc925b531f32..0c07b567118d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -865,12 +865,63 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } +static void nfs3_nlm_alloc_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + get_nfs_open_context(l_ctx->open_context); + nfs_get_lock_context(l_ctx->open_context); + } +} + +static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) + return nfs_async_iocounter_wait(task, l_ctx); + return false; + +} + +static void nfs3_nlm_release_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + struct nfs_open_context *ctx; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + ctx = l_ctx->open_context; + nfs_put_lock_context(l_ctx); + put_nfs_open_context(ctx); + } +} + +const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { + .nlmclnt_alloc_call = nfs3_nlm_alloc_call, + .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, + .nlmclnt_release_call = nfs3_nlm_release_call, +}; + static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); + struct nfs_lock_context *l_ctx = NULL; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + int status; - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + if (fl->fl_flags & FL_CLOSE) { + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + l_ctx = NULL; + else + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); + } + + status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx); + + if (l_ctx) + nfs_put_lock_context(l_ctx); + + return status; } static int nfs3_have_delegation(struct inode *inode, fmode_t flags) @@ -921,6 +972,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, .file_ops = &nfs_file_operations, + .nlmclnt_ops = &nlmclnt_fl_close_lock_ops, .getroot = nfs3_proc_get_root, .submount = nfs_submount, .try_mount = nfs_try_mount, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1e486c73ec94..319a47db218d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) return status; + res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + if (!res->commit_res.verf) + return -ENOMEM; status = nfs4_call_sync(server->client, server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) - return status; + goto out; - if (res->write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res->write_res.verifier.verifier); - if (status) - return status; + if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &res->commit_res.verf->verifier)) { + status = -EAGAIN; + goto out; } truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - return res->write_res.count; + status = res->write_res.count; +out: + kfree(res->commit_res.verf); + return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; + } if (err == -EAGAIN) { + dst_exception.retry = 1; + continue; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -379,6 +388,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); } else spin_unlock(&inode->i_lock); break; @@ -400,8 +410,6 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; } - - dprintk("%s server returns %d\n", __func__, task->tk_status); } static void diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6c7296454bbc..528362f69cc1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -66,12 +66,14 @@ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_copy_maxsz) + encode_copy_maxsz + \ + encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_copy_maxsz) + decode_copy_maxsz + \ + decode_commit_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +static void encode_copy_commit(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + /* * Encode COPY request */ @@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); + encode_copy_commit(xdr, args, &hdr); encode_nops(&hdr); } @@ -481,6 +496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, if (status) goto out; status = decode_copy(xdr, res); + if (status) + goto out; + status = decode_commit(xdr, &res->commit_res); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8346ccbf2d52..66776f022111 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -359,11 +359,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, struct nfs_client *old; int error; - if (clp->cl_cons_state == NFS_CS_READY) { + if (clp->cl_cons_state == NFS_CS_READY) /* the client is initialised already */ - dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); return clp; - } /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; @@ -421,7 +419,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error: nfs_mark_client_ready(clp, error); nfs_put_client(clp); - dprintk("<-- nfs4_init_client() = xerror %d\n", error); return ERR_PTR(error); } @@ -469,6 +466,50 @@ static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; } +static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, + struct nfs_client **prev, struct nfs_net *nn) +{ + int status; + + if (pos->rpc_ops != new->rpc_ops) + return 1; + + if (pos->cl_minorversion != new->cl_minorversion) + return 1; + + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(*prev); + *prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + + if (status < 0) + return status; + } + + if (pos->cl_cons_state != NFS_CS_READY) + return 1; + + if (pos->cl_clientid != new->cl_clientid) + return 1; + + /* NFSv4.1 always uses the uniform string, however someone + * might switch the uniquifier string on us. + */ + if (!nfs4_match_client_owner_id(pos, new)) + return 1; + + return 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -497,34 +538,10 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos" */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - if (status < 0) - goto out; - status = -NFS4ERR_STALE_CLIENTID; - spin_lock(&nn->nfs_client_lock); - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (pos->cl_clientid != new->cl_clientid) - continue; - - if (!nfs4_match_client_owner_id(pos, new)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out_unlock; + if (status != 0) continue; /* * We just sent a new SETCLIENTID, which should have @@ -557,8 +574,6 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = NULL; *result = pos; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); goto out; case -ERESTARTSYS: case -ETIMEDOUT: @@ -572,32 +587,17 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); } +out_unlock: spin_unlock(&nn->nfs_client_lock); /* No match found. The server lost our clientid */ out: nfs_put_client(prev); - dprintk("NFS: <-- %s status = %d\n", __func__, status); return status; } #ifdef CONFIG_NFS_V4_1 /* - * Returns true if the client IDs match - */ -static bool nfs4_match_clientids(u64 a, u64 b) -{ - if (a != b) { - dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a, b); - return false; - } - dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a, b); - return true; -} - -/* * Returns true if the server major ids match */ static bool @@ -605,36 +605,8 @@ nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { if (o1->major_id_sz != o2->major_id_sz) - goto out_major_mismatch; - if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) - goto out_major_mismatch; - - dprintk("NFS: --> %s server owner major IDs match\n", __func__); - return true; - -out_major_mismatch: - dprintk("NFS: --> %s server owner major IDs do not match\n", - __func__); - return false; -} - -/* - * Returns true if server minor ids match - */ -static bool -nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, - struct nfs41_server_owner *o2) -{ - /* Check eir_server_owner so_minor_id */ - if (o1->minor_id != o2->minor_id) - goto out_minor_mismatch; - - dprintk("NFS: --> %s server owner minor IDs match\n", __func__); - return true; - -out_minor_mismatch: - dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); - return false; + return false; + return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0; } /* @@ -645,18 +617,9 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, struct nfs41_server_scope *s2) { if (s1->server_scope_sz != s2->server_scope_sz) - goto out_scope_mismatch; - if (memcmp(s1->server_scope, s2->server_scope, - s1->server_scope_sz) != 0) - goto out_scope_mismatch; - - dprintk("NFS: --> %s server scopes match\n", __func__); - return true; - -out_scope_mismatch: - dprintk("NFS: --> %s server scopes do not match\n", - __func__); - return false; + return false; + return memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) == 0; } /** @@ -680,7 +643,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, struct rpc_xprt *xprt) { /* Check eir_clientid */ - if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + if (clp->cl_clientid != res->clientid) goto out_err; /* Check eir_server_owner so_major_id */ @@ -689,8 +652,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, goto out_err; /* Check eir_server_owner so_minor_id */ - if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, - res->server_owner)) + if (clp->cl_serverowner->minor_id != res->server_owner->minor_id) goto out_err; /* Check eir_server_scope */ @@ -739,33 +701,10 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos == new) goto found; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos", especially the client - * ID and serverowner fields. Wait for CREATE_SESSION - * to finish. */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); - if (status < 0) - break; - status = -NFS4ERR_STALE_CLIENTID; - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out; + if (status != 0) continue; /* @@ -777,23 +716,15 @@ int nfs41_walk_client_list(struct nfs_client *new, new->cl_serverowner)) continue; - /* Unlike NFSv4.0, we know that NFSv4.1 always uses the - * uniform string, however someone might switch the - * uniquifier string on us. - */ - if (!nfs4_match_client_owner_id(pos, new)) - continue; found: atomic_inc(&pos->cl_count); *result = pos; status = 0; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); break; } +out: spin_unlock(&nn->nfs_client_lock); - dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); return status; } @@ -916,9 +847,6 @@ static int nfs4_set_client(struct nfs_server *server, .timeparms = timeparms, }; struct nfs_client *clp; - int error; - - dprintk("--> nfs4_set_client()\n"); if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -927,15 +855,11 @@ static int nfs4_set_client(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - error = PTR_ERR(clp); - goto error; - } + if (IS_ERR(clp)) + return PTR_ERR(clp); - if (server->nfs_client == clp) { - error = -ELOOP; - goto error; - } + if (server->nfs_client == clp) + return -ELOOP; /* * Query for the lease time on clientid setup or renewal @@ -947,11 +871,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; - dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; -error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; } /* @@ -982,7 +902,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, }; - struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) @@ -998,10 +917,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init); - - dprintk("<-- %s %p\n", __func__, clp); - return clp; + return nfs_get_client(&cl_init); } EXPORT_SYMBOL_GPL(nfs4_set_ds_client); @@ -1098,8 +1014,6 @@ static int nfs4_init_server(struct nfs_server *server, struct rpc_timeout timeparms; int error; - dprintk("--> nfs4_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); @@ -1127,7 +1041,7 @@ static int nfs4_init_server(struct nfs_server *server, data->minorversion, data->net); if (error < 0) - goto error; + return error; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1138,16 +1052,10 @@ static int nfs4_init_server(struct nfs_server *server, server->acregmax = data->acregmax * HZ; server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; + server->port = data->nfs_server.port; - server->port = data->nfs_server.port; - - error = nfs_init_server_rpcclient(server, &timeparms, - data->selected_flavor); - -error: - /* Done */ - dprintk("<-- nfs4_init_server() = %d\n", error); - return error; + return nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); } /* @@ -1163,8 +1071,6 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, bool auth_probe; int error; - dprintk("--> nfs4_create_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1180,12 +1086,10 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (error < 0) goto error; - dprintk("<-- nfs4_create_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); } @@ -1200,8 +1104,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, bool auth_probe; int error; - dprintk("--> nfs4_create_referral_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1235,12 +1137,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } @@ -1300,31 +1200,16 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr *localaddr = (struct sockaddr *)&address; int error; - dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, - (unsigned long long)server->fsid.major, - (unsigned long long)server->fsid.minor, - hostname); - error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); - if (error != 0) { - dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; error = rpc_localaddr(clnt, localaddr, sizeof(address)); - if (error != 0) { - dprintk("<-- %s(): rpc_localaddr returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; - error = -EAFNOSUPPORT; - if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { - dprintk("<-- %s(): rpc_ntop returned %d\n", - __func__, error); - goto out; - } + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) + return -EAFNOSUPPORT; nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, @@ -1333,21 +1218,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); - dprintk("<-- %s(): nfs4_set_client returned %d\n", - __func__, error); - goto out; + return error; } if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); nfs_server_insert_lists(server); - error = nfs_probe_destination(server); - if (error < 0) - goto out; - - dprintk("<-- %s() succeeded\n", __func__); - -out: - return error; + return nfs_probe_destination(server); } diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 039b3eb6d834..ac8406018962 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -14,8 +14,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p struct nfs_fsinfo fsinfo; int ret = -ENOMEM; - dprintk("--> nfs4_get_rootfh()\n"); - fsinfo.fattr = nfs_alloc_fattr(); if (fsinfo.fattr == NULL) goto out; @@ -38,6 +36,5 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); - dprintk("<-- nfs4_get_rootfh() = %d\n", ret); return ret; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d8b040bd9814..7d531da1bae3 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -340,7 +340,6 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, out: free_page((unsigned long) page); free_page((unsigned long) page2); - dprintk("%s: done\n", __func__); return mnt; } @@ -358,11 +357,9 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * int err; /* BUG_ON(IS_ROOT(dentry)); */ - dprintk("%s: enter\n", __func__); - page = alloc_page(GFP_KERNEL); if (page == NULL) - goto out; + return mnt; fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (fs_locations == NULL) @@ -386,8 +383,6 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * out_free: __free_page(page); kfree(fs_locations); -out: - dprintk("%s: done\n", __func__); return mnt; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 201ca3f2c4ba..98b0b662af09 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -698,7 +698,8 @@ static int nfs41_sequence_process(struct rpc_task *task, session = slot->table->session; if (slot->interrupted) { - slot->interrupted = 0; + if (res->sr_status != -NFS4ERR_DELAY) + slot->interrupted = 0; interrupted = true; } @@ -2300,8 +2301,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) if (status != 0) return status; } - if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + nfs4_sequence_free_slot(&o_res->seq_res); nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + } return 0; } @@ -2586,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, /* Except MODE, it seems harmless of setting twice. */ if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - attrset[1] & FATTR4_WORD1_MODE) + (attrset[1] & FATTR4_WORD1_MODE || + attrset[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) @@ -3265,6 +3269,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f .rpc_resp = &res, }; int status; + int i; bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_FH_EXPIRE_TYPE | @@ -3330,8 +3335,13 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + + /* Avoid a regression due to buggy server */ + for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++) + res.exclcreat_bitmask[i] &= res.attr_bitmask[i]; memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, sizeof(server->exclcreat_bitmask)); + server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -4610,7 +4620,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, return 0; if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, - hdr->rw_ops->rw_mode) == -EIO) + hdr->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; @@ -4804,8 +4814,10 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); - if (data == NULL) + if (data == NULL) { + nfs_put_client(clp); return -ENOMEM; + } data->client = clp; data->timestamp = jiffies; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, @@ -5782,6 +5794,7 @@ struct nfs4_unlockdata { struct nfs_locku_res res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; struct file_lock fl; struct nfs_server *server; unsigned long timestamp; @@ -5806,6 +5819,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); + p->l_ctx = nfs_get_lock_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); p->server = NFS_SERVER(inode); return p; @@ -5816,6 +5830,7 @@ static void nfs4_locku_release_calldata(void *data) struct nfs4_unlockdata *calldata = data; nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); + nfs_put_lock_context(calldata->l_ctx); put_nfs_open_context(calldata->ctx); kfree(calldata); } @@ -5857,6 +5872,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) && + nfs_async_iocounter_wait(task, calldata->l_ctx)) + return; + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); @@ -5908,6 +5927,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, * canceled lock is passed in, and it won't be an unlock. */ fl->fl_type = F_UNLCK; + if (fl->fl_flags & FL_CLOSE) + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); if (data == NULL) { @@ -6352,7 +6373,7 @@ struct nfs4_lock_waiter { }; static int -nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; struct cb_notify_lock_args *cbnl = key; @@ -6395,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .inode = state->inode, .owner = &owner, .notified = false }; - wait_queue_t wait; + wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) @@ -6445,9 +6466,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) ctx = nfs_file_open_context(filp); state = ctx->state; - if (request->fl_start < 0 || request->fl_end < 0) - return -EINVAL; - if (IS_GETLK(cmd)) { if (state != NULL) return nfs4_proc_getlk(state, F_GETLK, request); @@ -6470,20 +6488,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; - /* - * Don't rely on the VFS having checked the file open mode, - * since it won't do this for flock() locks. - */ - switch (request->fl_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -7155,8 +7159,6 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, }; struct rpc_task *task; - dprintk("--> %s\n", __func__); - nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) args.dir = NFS4_CDFC4_FORE; @@ -7176,24 +7178,20 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); - status = -EIO; - goto out; + return -EIO; } if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); - status = -EIO; - goto out; + return -EIO; } if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); - status = -EIO; - goto out; + return -EIO; } } -out: - dprintk("<-- %s status= %d\n", __func__, status); + return status; } @@ -7459,15 +7457,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; struct nfs41_exchange_id_data *calldata; struct rpc_task *task; - int status = -EIO; + int status; if (!atomic_inc_not_zero(&clp->cl_count)) - goto out; + return -EIO; - status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) - goto out; + if (!calldata) { + nfs_put_client(clp); + return -ENOMEM; + } if (!xprt) nfs4_init_boot_verifier(clp, &verifier); @@ -7476,10 +7475,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status) goto out_calldata; - dprintk("NFS call exchange_id auth=%s, '%s'\n", - clp->cl_rpcclient->cl_auth->au_ops->au_name, - clp->cl_owner_id); - calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); status = -ENOMEM; @@ -7545,13 +7540,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, rpc_put_task(task); out: - if (clp->cl_implid != NULL) - dprintk("NFS reply exchange_id: Server Implementation ID: " - "domain: %s, name: %s, date: %llu,%u\n", - clp->cl_implid->domain, clp->cl_implid->name, - clp->cl_implid->date.seconds, - clp->cl_implid->date.nseconds); - dprintk("NFS reply exchange_id: %d\n", status); return status; out_impl_id: @@ -7769,17 +7757,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); nfs4_set_sequence_privileged(&args.la_seq_args); - dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); if (IS_ERR(task)) - status = PTR_ERR(task); - else { - status = task->tk_status; - rpc_put_task(task); - } - dprintk("<-- %s return %d\n", __func__, status); + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); return status; } @@ -8180,6 +8164,12 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + break; default: nfs4_schedule_lease_recovery(clp); } @@ -8258,7 +8248,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, if (status == 0) status = task->tk_status; rpc_put_task(task); - return 0; out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8357,6 +8346,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, */ pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); pnfs_free_lseg_list(&head); status = -EAGAIN; goto out; @@ -8427,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata) size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); + nfs4_sequence_free_slot(&lgp->res.seq_res); nfs4_free_pages(lgp->args.layout.pages, max_pages); pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); @@ -8501,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); - nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8156bad6b441..cbf82b0d4467 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1649,13 +1649,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); } -static void nfs4_reclaim_complete(struct nfs_client *clp, +static int nfs4_reclaim_complete(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops, struct rpc_cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) - (void)ops->reclaim_complete(clp, cred); + return ops->reclaim_complete(clp, cred); + return 0; } static void nfs4_clear_reclaim_server(struct nfs_server *server) @@ -1702,13 +1703,16 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { const struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; + int err; if (!nfs4_state_clear_reclaim_reboot(clp)) return; ops = clp->cl_mvops->reboot_recovery_ops; cred = nfs4_get_clid_cred(clp); - nfs4_reclaim_complete(clp, ops, cred); + err = nfs4_reclaim_complete(clp, ops, cred); put_rpccred(cred); + if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) @@ -2130,6 +2134,8 @@ again: put_rpccred(cred); switch (status) { case 0: + case -EINTR: + case -ERESTARTSYS: break; case -ETIMEDOUT: if (clnt->cl_softrtry) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 80ce289eea05..3aebfdc82b30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1000,8 +1000,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, + const umode_t *umask, const struct nfs_server *server, - bool excl_check, const umode_t *umask) + const uint32_t attrmask[]) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1016,22 +1017,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. */ - if (iap->ia_valid & ATTR_SIZE) { + if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } - if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) - umask = NULL; if (iap->ia_valid & ATTR_MODE) { - if (umask) { + if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) { bmval[2] |= FATTR4_WORD2_MODE_UMASK; len += 8; - } else { + } else if (attrmask[1] & FATTR4_WORD1_MODE) { bmval[1] |= FATTR4_WORD1_MODE; len += 4; } } - if (iap->ia_valid & ATTR_UID) { + if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", @@ -1044,7 +1043,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER; len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } - if (iap->ia_valid & ATTR_GID) { + if ((iap->ia_valid & ATTR_GID) && + (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) { owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", @@ -1056,32 +1056,26 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER_GROUP; len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (iap->ia_valid & ATTR_ATIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; - } else if (iap->ia_valid & ATTR_ATIME) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 4; - } - if (iap->ia_valid & ATTR_MTIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; - } else if (iap->ia_valid & ATTR_MTIME) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 4; + if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 16; + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 4; + } } - - if (excl_check) { - const u32 *excl_bmval = server->exclcreat_bitmask; - bmval[0] &= excl_bmval[0]; - bmval[1] &= excl_bmval[1]; - bmval[2] &= excl_bmval[2]; - - if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) - label = NULL; + if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 16; + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 4; + } } - if (label) { + if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } @@ -1188,8 +1182,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false, - &create->umask); + encode_attrs(xdr, create->attrs, create->label, &create->umask, + create->server, create->server->attr_bitmask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1409,13 +1403,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1424,8 +1418,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->exclcreat_bitmask); } } @@ -1681,7 +1675,8 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); + encode_attrs(xdr, arg->iap, arg->label, NULL, server, + server->attr_bitmask); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2005,16 +2000,10 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { - NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( - NFS_I(inode)->layout, xdr, args); - } else { - encode_uint32(xdr, args->layoutupdate_len); - if (args->layoutupdate_pages) { - xdr_write_pages(xdr, args->layoutupdate_pages, 0, - args->layoutupdate_len); - } - } + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); return 0; } @@ -2024,7 +2013,6 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { - const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2041,8 +2029,6 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_unlock(&args->inode->i_lock); if (args->ld_private->ops && args->ld_private->ops->encode) args->ld_private->ops->encode(xdr, args, args->ld_private); - else if (lr_ops->encode_layoutreturn) - lr_ops->encode_layoutreturn(xdr, args); else encode_uint32(xdr, 0); } @@ -5579,6 +5565,8 @@ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) unsigned int i; p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; bitmap_words = be32_to_cpup(p++); if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) return -EIO; diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild deleted file mode 100644 index ed30ea072bb8..000000000000 --- a/fs/nfs/objlayout/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the pNFS Objects Layout Driver kernel module -# -objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c deleted file mode 100644 index 049c1b1f2932..000000000000 --- a/fs/nfs/objlayout/objio_osd.c +++ /dev/null @@ -1,675 +0,0 @@ -/* - * pNFS Objects layout implementation over open-osd initiator library - * - * Copyright (C) 2009 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/module.h> -#include <scsi/osd_ore.h> - -#include "objlayout.h" -#include "../internal.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -struct objio_dev_ent { - struct nfs4_deviceid_node id_node; - struct ore_dev od; -}; - -static void -objio_free_deviceid_node(struct nfs4_deviceid_node *d) -{ - struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - - dprintk("%s: free od=%p\n", __func__, de->od.od); - osduld_put_device(de->od.od); - kfree_rcu(d, rcu); -} - -struct objio_segment { - struct pnfs_layout_segment lseg; - - struct ore_layout layout; - struct ore_components oc; -}; - -static inline struct objio_segment * -OBJIO_LSEG(struct pnfs_layout_segment *lseg) -{ - return container_of(lseg, struct objio_segment, lseg); -} - -struct objio_state { - /* Generic layer */ - struct objlayout_io_res oir; - - bool sync; - /*FIXME: Support for extra_bytes at ore_get_rw_state() */ - struct ore_io_state *ios; -}; - -/* Send and wait for a get_device_info of devices in the layout, - then look them up with the osd_initiator library */ -struct nfs4_deviceid_node * -objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, - gfp_t gfp_flags) -{ - struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode = NULL; - struct osd_dev *od; - struct osd_dev_info odi; - bool retry_flag = true; - __be32 *p; - int err; - - deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); - if (!deviceaddr) - return NULL; - - p = page_address(pdev->pages[0]); - pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); - - odi.systemid_len = deviceaddr->oda_systemid.len; - if (odi.systemid_len > sizeof(odi.systemid)) { - dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", - __func__, sizeof(odi.systemid)); - err = -EINVAL; - goto out; - } else if (odi.systemid_len) - memcpy(odi.systemid, deviceaddr->oda_systemid.data, - odi.systemid_len); - odi.osdname_len = deviceaddr->oda_osdname.len; - odi.osdname = (u8 *)deviceaddr->oda_osdname.data; - - if (!odi.osdname_len && !odi.systemid_len) { - dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", - __func__); - err = -ENODEV; - goto out; - } - -retry_lookup: - od = osduld_info_lookup(&odi); - if (IS_ERR(od)) { - err = PTR_ERR(od); - dprintk("%s: osduld_info_lookup => %d\n", __func__, err); - if (err == -ENODEV && retry_flag) { - err = objlayout_autologin(deviceaddr); - if (likely(!err)) { - retry_flag = false; - goto retry_lookup; - } - } - goto out; - } - - dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); - - ode = kzalloc(sizeof(*ode), gfp_flags); - if (!ode) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - goto out; - } - - nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); - kfree(deviceaddr); - - ode->od.od = od; - return &ode->id_node; - -out: - kfree(deviceaddr); - return NULL; -} - -static void copy_single_comp(struct ore_components *oc, unsigned c, - struct pnfs_osd_object_cred *src_comp) -{ - struct ore_comp *ocomp = &oc->comps[c]; - - WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ - WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - - ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; - ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - - memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); -} - -static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, - struct objio_segment **pseg) -{ -/* This is the in memory structure of the objio_segment - * - * struct __alloc_objio_segment { - * struct objio_segment olseg; - * struct ore_dev *ods[numdevs]; - * struct ore_comp comps[numdevs]; - * } *aolseg; - * NOTE: The code as above compiles and runs perfectly. It is elegant, - * type safe and compact. At some Past time Linus has decided he does not - * like variable length arrays, For the sake of this principal we uglify - * the code as below. - */ - struct objio_segment *lseg; - size_t lseg_size = sizeof(*lseg) + - numdevs * sizeof(lseg->oc.ods[0]) + - numdevs * sizeof(*lseg->oc.comps); - - lseg = kzalloc(lseg_size, gfp_flags); - if (unlikely(!lseg)) { - dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, - numdevs, lseg_size); - return -ENOMEM; - } - - lseg->oc.numdevs = numdevs; - lseg->oc.single_comp = EC_MULTPLE_COMPS; - lseg->oc.ods = (void *)(lseg + 1); - lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); - - *pseg = lseg; - return 0; -} - -int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags) -{ - struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); - struct objio_segment *objio_seg; - struct pnfs_osd_xdr_decode_layout_iter iter; - struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred src_comp; - unsigned cur_comp; - int err; - - err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); - if (unlikely(err)) - return err; - - err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); - if (unlikely(err)) - return err; - - objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; - objio_seg->layout.group_width = layout.olo_map.odm_group_width; - objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; - objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - - err = ore_verify_layout(layout.olo_map.odm_num_comps, - &objio_seg->layout); - if (unlikely(err)) - goto err; - - objio_seg->oc.first_dev = layout.olo_comps_index; - cur_comp = 0; - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { - struct nfs4_deviceid_node *d; - struct objio_dev_ent *ode; - - copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - - d = nfs4_find_get_deviceid(server, - &src_comp.oc_object_id.oid_device_id, - pnfslay->plh_lc_cred, gfp_flags); - if (!d) { - err = -ENXIO; - goto err; - } - - ode = container_of(d, struct objio_dev_ent, id_node); - objio_seg->oc.ods[cur_comp++] = &ode->od; - } - /* pnfs_osd_xdr_decode_layout_comp returns false on error */ - if (unlikely(err)) - goto err; - - *outp = &objio_seg->lseg; - return 0; - -err: - kfree(objio_seg); - dprintk("%s: Error: return %d\n", __func__, err); - *outp = NULL; - return err; -} - -void objio_free_lseg(struct pnfs_layout_segment *lseg) -{ - int i; - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - - for (i = 0; i < objio_seg->oc.numdevs; i++) { - struct ore_dev *od = objio_seg->oc.ods[i]; - struct objio_dev_ent *ode; - - if (!od) - break; - ode = container_of(od, typeof(*ode), od); - nfs4_put_deviceid_node(&ode->id_node); - } - kfree(objio_seg); -} - -static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, - struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, - loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, - struct objio_state **outp) -{ - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct ore_io_state *ios; - int ret; - struct __alloc_objio_state { - struct objio_state objios; - struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; - } *aos; - - aos = kzalloc(sizeof(*aos), gfp_flags); - if (unlikely(!aos)) - return -ENOMEM; - - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, - aos->ioerrs, rpcdata, pnfs_layout_type); - - ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, - offset, count, &ios); - if (unlikely(ret)) { - kfree(aos); - return ret; - } - - ios->pages = pages; - ios->pgbase = pgbase; - ios->private = aos; - BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - - aos->objios.sync = 0; - aos->objios.ios = ios; - *outp = &aos->objios; - return 0; -} - -void objio_free_result(struct objlayout_io_res *oir) -{ - struct objio_state *objios = container_of(oir, struct objio_state, oir); - - ore_put_io_state(objios->ios); - kfree(objios); -} - -static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) -{ - switch (oep) { - case OSD_ERR_PRI_NO_ERROR: - return (enum pnfs_osd_errno)0; - - case OSD_ERR_PRI_CLEAR_PAGES: - BUG_ON(1); - return 0; - - case OSD_ERR_PRI_RESOURCE: - return PNFS_OSD_ERR_RESOURCE; - case OSD_ERR_PRI_BAD_CRED: - return PNFS_OSD_ERR_BAD_CRED; - case OSD_ERR_PRI_NO_ACCESS: - return PNFS_OSD_ERR_NO_ACCESS; - case OSD_ERR_PRI_UNREACHABLE: - return PNFS_OSD_ERR_UNREACHABLE; - case OSD_ERR_PRI_NOT_FOUND: - return PNFS_OSD_ERR_NOT_FOUND; - case OSD_ERR_PRI_NO_SPACE: - return PNFS_OSD_ERR_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case OSD_ERR_PRI_EIO: - return PNFS_OSD_ERR_EIO; - } -} - -static void __on_dev_error(struct ore_io_state *ios, - struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, - u64 dev_offset, u64 dev_len) -{ - struct objio_state *objios = ios->private; - struct pnfs_osd_objid pooid; - struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); - /* FIXME: what to do with more-then-one-group layouts. We need to - * translate from ore_io_state index to oc->comps index - */ - unsigned comp = dev_index; - - pooid.oid_device_id = ode->id_node.deviceid; - pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; - pooid.oid_object_id = ios->oc->comps[comp].obj.id; - - objlayout_io_set_result(&objios->oir, comp, - &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, !ios->reading); -} - -/* - * read - */ -static void _read_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) - status = ios->length; - else - status = ret; - - objlayout_read_done(&objios->oir, status, objios->sync); -} - -int objio_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, - GFP_KERNEL, &objios); - if (unlikely(ret)) - return ret; - - objios->ios->done = _read_done; - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_read(objios->ios); - if (unlikely(ret)) - objio_free_result(&objios->oir); - return ret; -} - -/* - * write - */ -static void _write_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) { - /* FIXME: should be based on the OSD's persistence model - * See OSD2r05 Section 4.13 Data persistence model */ - objios->oir.committed = NFS_FILE_SYNC; - status = ios->length; - } else { - status = ret; - } - - objlayout_write_done(&objios->oir, status, objios->sync); -} - -static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) -{ - struct objio_state *objios = priv; - struct nfs_pgio_header *hdr = objios->oir.rpcdata; - struct address_space *mapping = hdr->inode->i_mapping; - pgoff_t index = offset / PAGE_SIZE; - struct page *page; - loff_t i_size = i_size_read(hdr->inode); - - if (offset >= i_size) { - *uptodate = true; - dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); - return ZERO_PAGE(0); - } - - page = find_get_page(mapping, index); - if (!page) { - page = find_or_create_page(mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s: grab_cache_page Failed index=0x%lx\n", - __func__, index); - return NULL; - } - unlock_page(page); - } - *uptodate = PageUptodate(page); - dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); - return page; -} - -static void __r4w_put_page(void *priv, struct page *page) -{ - dprintk("%s: index=0x%lx\n", __func__, - (page == ZERO_PAGE(0)) ? -1UL : page->index); - if (ZERO_PAGE(0) != page) - put_page(page); - return; -} - -static const struct _ore_r4w_op _r4w_op = { - .get_page = &__r4w_get_page, - .put_page = &__r4w_put_page, -}; - -int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, GFP_NOFS, - &objios); - if (unlikely(ret)) - return ret; - - objios->sync = 0 != (how & FLUSH_SYNC); - objios->ios->r4w = &_r4w_op; - - if (!objios->sync) - objios->ios->done = _write_done; - - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_write(objios->ios); - if (unlikely(ret)) { - objio_free_result(&objios->oir); - return ret; - } - - if (objios->sync) - _write_done(objios->ios, objios); - - return 0; -} - -/* - * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number - * of bytes (maximum @req->wb_bytes) that can be coalesced. - */ -static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, - struct nfs_page *prev, struct nfs_page *req) -{ - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio); - unsigned int size; - - size = pnfs_generic_pg_test(pgio, prev, req); - - if (!size || mirror->pg_count + req->wb_bytes > - (unsigned long)pgio->pg_layout_private) - return 0; - - return min(size, req->wb_bytes); -} - -static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - pnfs_generic_pg_init_read(pgio, req); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; -} - -static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, - unsigned long *stripe_end) -{ - u32 stripe_off; - unsigned stripe_size; - - if (layout->raid_algorithm == PNFS_OSD_RAID_0) - return true; - - stripe_size = layout->stripe_unit * - (layout->group_width - layout->parity); - - div_u64_rem(offset, stripe_size, &stripe_off); - if (!stripe_off) - return true; - - *stripe_end = stripe_size - stripe_off; - return false; -} - -static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - unsigned long stripe_end = 0; - u64 wb_size; - - if (pgio->pg_dreq == NULL) - wb_size = i_size_read(pgio->pg_inode) - req_offset(req); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - if (req->wb_offset || - !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, - &OBJIO_LSEG(pgio->pg_lseg)->layout, - &stripe_end)) { - pgio->pg_layout_private = (void *)stripe_end; - } else { - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; - } -} - -static const struct nfs_pageio_ops objio_pg_read_ops = { - .pg_init = objio_init_read, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_readpages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static const struct nfs_pageio_ops objio_pg_write_ops = { - .pg_init = objio_init_write, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_writepages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static struct pnfs_layoutdriver_type objlayout_type = { - .id = LAYOUT_OSD2_OBJECTS, - .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR | - PNFS_LAYOUTRET_ON_ERROR, - - .max_deviceinfo_size = PAGE_SIZE, - .owner = THIS_MODULE, - .alloc_layout_hdr = objlayout_alloc_layout_hdr, - .free_layout_hdr = objlayout_free_layout_hdr, - - .alloc_lseg = objlayout_alloc_lseg, - .free_lseg = objlayout_free_lseg, - - .read_pagelist = objlayout_read_pagelist, - .write_pagelist = objlayout_write_pagelist, - .pg_read_ops = &objio_pg_read_ops, - .pg_write_ops = &objio_pg_write_ops, - - .sync = pnfs_generic_sync, - - .free_deviceid_node = objio_free_deviceid_node, - - .encode_layoutcommit = objlayout_encode_layoutcommit, - .encode_layoutreturn = objlayout_encode_layoutreturn, -}; - -MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); -MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>"); -MODULE_LICENSE("GPL"); - -static int __init -objlayout_init(void) -{ - int ret = pnfs_register_layoutdriver(&objlayout_type); - - if (ret) - printk(KERN_INFO - "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", - __func__, ret); - else - printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", - __func__); - return ret; -} - -static void __exit -objlayout_exit(void) -{ - pnfs_unregister_layoutdriver(&objlayout_type); - printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", - __func__); -} - -MODULE_ALIAS("nfs-layouttype4-2"); - -module_init(objlayout_init); -module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c deleted file mode 100644 index 8f3d2acb81c3..000000000000 --- a/fs/nfs/objlayout/objlayout.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * pNFS Objects layout driver high level definitions - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/kmod.h> -#include <linux/moduleparam.h> -#include <linux/ratelimit.h> -#include <scsi/osd_initiator.h> -#include "objlayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD -/* - * Create a objlayout layout structure for the given inode and return it. - */ -struct pnfs_layout_hdr * -objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) -{ - struct objlayout *objlay; - - objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (!objlay) - return NULL; - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - dprintk("%s: Return %p\n", __func__, objlay); - return &objlay->pnfs_layout; -} - -/* - * Free an objlayout layout structure - */ -void -objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) -{ - struct objlayout *objlay = OBJLAYOUT(lo); - - dprintk("%s: objlay %p\n", __func__, objlay); - - WARN_ON(!list_empty(&objlay->err_list)); - kfree(objlay); -} - -/* - * Unmarshall layout and store it in pnfslay. - */ -struct pnfs_layout_segment * -objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - int status = -ENOMEM; - struct xdr_stream stream; - struct xdr_buf buf = { - .pages = lgr->layoutp->pages, - .page_len = lgr->layoutp->len, - .buflen = lgr->layoutp->len, - .len = lgr->layoutp->len, - }; - struct page *scratch; - struct pnfs_layout_segment *lseg; - - dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); - - scratch = alloc_page(gfp_flags); - if (!scratch) - goto err_nofree; - - xdr_init_decode(&stream, &buf, NULL); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); - if (unlikely(status)) { - dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, - status); - goto err; - } - - __free_page(scratch); - - dprintk("%s: Return %p\n", __func__, lseg); - return lseg; - -err: - __free_page(scratch); -err_nofree: - dprintk("%s: Err Return=>%d\n", __func__, status); - return ERR_PTR(status); -} - -/* - * Free a layout segement - */ -void -objlayout_free_lseg(struct pnfs_layout_segment *lseg) -{ - dprintk("%s: freeing layout segment %p\n", __func__, lseg); - - if (unlikely(!lseg)) - return; - - objio_free_lseg(lseg); -} - -/* - * I/O Operations - */ -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - -static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, - struct page ***p_pages, unsigned *p_pgbase, - u64 offset, unsigned long count) -{ - u64 lseg_end_offset; - - BUG_ON(offset < lseg->pls_range.offset); - lseg_end_offset = end_offset(lseg->pls_range.offset, - lseg->pls_range.length); - BUG_ON(offset >= lseg_end_offset); - WARN_ON(offset + count > lseg_end_offset); - - if (*p_pgbase > PAGE_SIZE) { - dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); - *p_pages += *p_pgbase >> PAGE_SHIFT; - *p_pgbase &= ~PAGE_MASK; - } -} - -/* - * I/O done common code - */ -static void -objlayout_iodone(struct objlayout_io_res *oir) -{ - if (likely(oir->status >= 0)) { - objio_free_result(oir); - } else { - struct objlayout *objlay = oir->objlay; - - spin_lock(&objlay->lock); - objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &oir->err_list); - spin_unlock(&objlay->lock); - } -} - -/* - * objlayout_io_set_result - Set an osd_error code on a specific osd comp. - * - * The @index component IO failed (error returned from target). Register - * the error for later reporting at layout-return. - */ -void -objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, - struct pnfs_osd_objid *pooid, int osd_error, - u64 offset, u64 length, bool is_write) -{ - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - - BUG_ON(index >= oir->num_comps); - if (osd_error) { - ioerr->oer_component = *pooid; - ioerr->oer_comp_offset = offset; - ioerr->oer_comp_length = length; - ioerr->oer_iswrite = is_write; - ioerr->oer_errno = osd_error; - - dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " - "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, index, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - } else { - /* User need not call if no error is reported */ - ioerr->oer_errno = 0; - } -} - -/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). - * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_read_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_read_done(hdr); -} - -void -objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) - hdr->res.count = status; - else - hdr->pnfs_error = status; - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, - status, hdr->res.eof, sync); - - if (sync) - pnfs_ld_read_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async reads. - */ -enum pnfs_try_status -objlayout_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct inode *inode = hdr->inode; - loff_t offset = hdr->args.offset; - size_t count = hdr->args.count; - int err; - loff_t eof; - - eof = i_size_read(inode); - if (unlikely(offset + count > eof)) { - if (offset >= eof) { - err = 0; - hdr->res.count = 0; - hdr->res.eof = 1; - /*FIXME: do we need to call pnfs_ld_read_done() */ - goto out; - } - count = eof - offset; - } - - hdr->res.eof = (offset + count) >= eof; - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n", - __func__, inode->i_ino, offset, count, hdr->res.eof); - - err = objio_read_pagelist(hdr); - out: - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). - * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_write_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_write_done(hdr); -} - -void -objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) { - hdr->res.count = status; - hdr->verf.committed = oir->committed; - } else { - hdr->pnfs_error = status; - } - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, - status, hdr->verf.committed, sync); - - if (sync) - pnfs_ld_write_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async writes. - */ -enum pnfs_try_status -objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - int err; - - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - err = objio_write_pagelist(hdr, how); - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -void -objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args) -{ - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct pnfs_osd_layoutupdate lou; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - - spin_lock(&objlay->lock); - lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); - lou.dsu_delta = objlay->delta_space_used; - objlay->delta_space_used = 0; - objlay->delta_space_valid = OBJ_DSU_INIT; - lou.olu_ioerr_flag = !list_empty(&objlay->err_list); - spin_unlock(&objlay->lock); - - start = xdr_reserve_space(xdr, 4); - - BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - - dprintk("%s: Return delta_space_used %lld err %d\n", __func__, - lou.dsu_delta, lou.olu_ioerr_flag); -} - -static int -err_prio(u32 oer_errno) -{ - switch (oer_errno) { - case 0: - return 0; - - case PNFS_OSD_ERR_RESOURCE: - return OSD_ERR_PRI_RESOURCE; - case PNFS_OSD_ERR_BAD_CRED: - return OSD_ERR_PRI_BAD_CRED; - case PNFS_OSD_ERR_NO_ACCESS: - return OSD_ERR_PRI_NO_ACCESS; - case PNFS_OSD_ERR_UNREACHABLE: - return OSD_ERR_PRI_UNREACHABLE; - case PNFS_OSD_ERR_NOT_FOUND: - return OSD_ERR_PRI_NOT_FOUND; - case PNFS_OSD_ERR_NO_SPACE: - return OSD_ERR_PRI_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case PNFS_OSD_ERR_EIO: - return OSD_ERR_PRI_EIO; - } -} - -static void -merge_ioerr(struct pnfs_osd_ioerr *dest_err, - const struct pnfs_osd_ioerr *src_err) -{ - u64 dest_end, src_end; - - if (!dest_err->oer_errno) { - *dest_err = *src_err; - /* accumulated device must be blank */ - memset(&dest_err->oer_component.oid_device_id, 0, - sizeof(dest_err->oer_component.oid_device_id)); - - return; - } - - if (dest_err->oer_component.oid_partition_id != - src_err->oer_component.oid_partition_id) - dest_err->oer_component.oid_partition_id = 0; - - if (dest_err->oer_component.oid_object_id != - src_err->oer_component.oid_object_id) - dest_err->oer_component.oid_object_id = 0; - - if (dest_err->oer_comp_offset > src_err->oer_comp_offset) - dest_err->oer_comp_offset = src_err->oer_comp_offset; - - dest_end = end_offset(dest_err->oer_comp_offset, - dest_err->oer_comp_length); - src_end = end_offset(src_err->oer_comp_offset, - src_err->oer_comp_length); - if (dest_end < src_end) - dest_end = src_end; - - dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; - - if ((src_err->oer_iswrite == dest_err->oer_iswrite) && - (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { - dest_err->oer_errno = src_err->oer_errno; - } else if (src_err->oer_iswrite) { - dest_err->oer_iswrite = true; - dest_err->oer_errno = src_err->oer_errno; - } -} - -static void -encode_accumulated_error(struct objlayout *objlay, __be32 *p) -{ - struct objlayout_io_res *oir, *tmp; - struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - unsigned i; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " - "is_write=%d dev(%llx:%llx) par=0x%llx " - "obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - merge_ioerr(&accumulated_err, ioerr); - } - list_del(&oir->err_list); - objio_free_result(oir); - } - - pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); -} - -void -objlayout_encode_layoutreturn(struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct pnfs_layout_hdr *pnfslay = args->layout; - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_res *oir, *tmp; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - - spin_lock(&objlay->lock); - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - __be32 *last_xdr = NULL, *p; - unsigned i; - int res = 0; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - dprintk("%s: err[%d]: errno=%d is_write=%d " - "dev(%llx:%llx) par=0x%llx obj=0x%llx " - "offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - p = pnfs_osd_xdr_ioerr_reserve_space(xdr); - if (unlikely(!p)) { - res = -E2BIG; - break; /* accumulated_error */ - } - - last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); - } - - /* TODO: use xdr_write_pages */ - if (unlikely(res)) { - /* no space for even one error descriptor */ - BUG_ON(!last_xdr); - - /* we've encountered a situation with lots and lots of - * errors and no space to encode them all. Use the last - * available slot to report the union of all the - * remaining errors. - */ - encode_accumulated_error(objlay, last_xdr); - goto loop_done; - } - list_del(&oir->err_list); - objio_free_result(oir); - } -loop_done: - spin_unlock(&objlay->lock); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - dprintk("%s: Return\n", __func__); -} - -enum { - OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, - OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, - OSD_LOGIN_UPCALL_PATHLEN = 256 -}; - -static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; - -module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), - 0600); -MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); - -struct __auto_login { - char uri[OBJLAYOUT_MAX_URI_LEN]; - char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; - char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; -}; - -static int __objlayout_upcall(struct __auto_login *login) -{ - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL - }; - char *argv[8]; - int ret; - - if (unlikely(!osd_login_prog[0])) { - dprintk("%s: osd_login_prog is disabled\n", __func__); - return -EACCES; - } - - dprintk("%s uri: %s\n", __func__, login->uri); - dprintk("%s osdname %s\n", __func__, login->osdname); - dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); - - argv[0] = (char *)osd_login_prog; - argv[1] = "-u"; - argv[2] = login->uri; - argv[3] = "-o"; - argv[4] = login->osdname; - argv[5] = "-s"; - argv[6] = login->systemid_hex; - argv[7] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - /* - * Disable the upcall mechanism if we're getting an ENOENT or - * EACCES error. The admin can re-enable it on the fly by using - * sysfs to set the objlayoutdriver.osd_login_prog module parameter once - * the problem has been fixed. - */ - if (ret == -ENOENT || ret == -EACCES) { - printk(KERN_ERR "PNFS-OBJ: %s was not found please set " - "objlayoutdriver.osd_login_prog kernel parameter!\n", - osd_login_prog); - osd_login_prog[0] = '\0'; - } - dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); - - return ret; -} - -/* Assume dest is all zeros */ -static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, - char *dest, int max_len, - const char *var_name) -{ - if (!s.len) - return; - - if (s.len >= max_len) { - pr_warn_ratelimited( - "objlayout_autologin: %s: s.len(%d) >= max_len(%d)", - var_name, s.len, max_len); - s.len = max_len - 1; /* space for null terminator */ - } - - memcpy(dest, s.data, s.len); -} - -/* Assume sysid is all zeros */ -static void _sysid_2_hex(struct nfs4_string s, - char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) -{ - int i; - char *cur; - - if (!s.len) - return; - - if (s.len != OSD_SYSTEMID_LEN) { - pr_warn_ratelimited( - "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", - s.len); - if (s.len > OSD_SYSTEMID_LEN) - s.len = OSD_SYSTEMID_LEN; - } - - cur = sysid; - for (i = 0; i < s.len; i++) - cur = hex_byte_pack(cur, s.data[i]); -} - -int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) -{ - int rc; - struct __auto_login login; - - if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) - return -ENODEV; - - memset(&login, 0, sizeof(login)); - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_targetaddr.ota_netaddr.r_addr, - login.uri, sizeof(login.uri), "URI"); - - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_osdname, - login.osdname, sizeof(login.osdname), "OSDNAME"); - - _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); - - rc = __objlayout_upcall(&login); - if (rc > 0) /* script returns positive values */ - rc = -ENODEV; - - return rc; -} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h deleted file mode 100644 index fc94a5872ed4..000000000000 --- a/fs/nfs/objlayout/objlayout.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Data types and function declerations for interfacing with the - * pNFS standard object layout driver. - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _OBJLAYOUT_H -#define _OBJLAYOUT_H - -#include <linux/nfs_fs.h> -#include <linux/pnfs_osd_xdr.h> -#include "../pnfs.h" - -/* - * per-inode layout - */ -struct objlayout { - struct pnfs_layout_hdr pnfs_layout; - - /* for layout_commit */ - enum osd_delta_space_valid_enum { - OBJ_DSU_INIT = 0, - OBJ_DSU_VALID, - OBJ_DSU_INVALID, - } delta_space_valid; - s64 delta_space_used; /* consumed by write ops */ - - /* for layout_return */ - spinlock_t lock; - struct list_head err_list; -}; - -static inline struct objlayout * -OBJLAYOUT(struct pnfs_layout_hdr *lo) -{ - return container_of(lo, struct objlayout, pnfs_layout); -} - -/* - * per-I/O operation state - * embedded in objects provider io_state data structure - */ -struct objlayout_io_res { - struct objlayout *objlay; - - void *rpcdata; - int status; /* res */ - int committed; /* res */ - - /* Error reporting (layout_return) */ - struct list_head err_list; - unsigned num_comps; - /* Pointer to array of error descriptors of size num_comps. - * It should contain as many entries as devices in the osd_layout - * that participate in the I/O. It is up to the io_engine to allocate - * needed space and set num_comps. - */ - struct pnfs_osd_ioerr *ioerrs; -}; - -static inline -void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, - struct pnfs_osd_ioerr *ioerrs, void *rpcdata, - struct pnfs_layout_hdr *pnfs_layout_type) -{ - oir->objlay = OBJLAYOUT(pnfs_layout_type); - oir->rpcdata = rpcdata; - INIT_LIST_HEAD(&oir->err_list); - oir->num_comps = num_comps; - oir->ioerrs = ioerrs; -} - -/* - * Raid engine I/O API - */ -extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags); -extern void objio_free_lseg(struct pnfs_layout_segment *lseg); - -/* objio_free_result will free these @oir structs received from - * objlayout_{read,write}_done - */ -extern void objio_free_result(struct objlayout_io_res *oir); - -extern int objio_read_pagelist(struct nfs_pgio_header *rdata); -extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how); - -/* - * callback API - */ -extern void objlayout_io_set_result(struct objlayout_io_res *oir, - unsigned index, struct pnfs_osd_objid *pooid, - int osd_error, u64 offset, u64 length, bool is_write); - -static inline void -objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) -{ - /* If one of the I/Os errored out and the delta_space_used was - * invalid we render the complete report as invalid. Protocol mandate - * the DSU be accurate or not reported. - */ - spin_lock(&objlay->lock); - if (objlay->delta_space_valid != OBJ_DSU_INVALID) { - objlay->delta_space_valid = OBJ_DSU_VALID; - objlay->delta_space_used += space_used; - } - spin_unlock(&objlay->lock); -} - -extern void objlayout_read_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); - -/* - * exported generic objects function vectors - */ - -extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); -extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); - -extern struct pnfs_layout_segment *objlayout_alloc_lseg( - struct pnfs_layout_hdr *, - struct nfs4_layoutget_res *, - gfp_t gfp_flags); -extern void objlayout_free_lseg(struct pnfs_layout_segment *); - -extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_pgio_header *); - -extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_pgio_header *, - int how); - -extern void objlayout_encode_layoutcommit( - struct pnfs_layout_hdr *, - struct xdr_stream *, - const struct nfs4_layoutcommit_args *); - -extern void objlayout_encode_layoutreturn( - struct xdr_stream *, - const struct nfs4_layoutreturn_args *); - -extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); - -#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c deleted file mode 100644 index f093c7ec983b..000000000000 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Object-Based pNFS Layout XDR layer - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/pnfs_osd_xdr.h> - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* - * The following implementation is based on RFC5664 - */ - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static __be32 * -_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) -{ - p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, - sizeof(objid->oid_device_id.data)); - - p = xdr_decode_hyper(p, &objid->oid_partition_id); - p = xdr_decode_hyper(p, &objid->oid_object_id); - return p; -} -/* - * struct pnfs_osd_opaque_cred { - * u32 cred_len; - * void *cred; - * }; // xdr size [variable] - * The return pointers are from the xdr buffer - */ -static int -_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 1); - - if (!p) - return -EINVAL; - - opaque_cred->cred_len = be32_to_cpu(*p++); - - p = xdr_inline_decode(xdr, opaque_cred->cred_len); - if (!p) - return -EINVAL; - - opaque_cred->cred = p; - return 0; -} - -/* - * struct pnfs_osd_object_cred { - * struct pnfs_osd_objid oc_object_id; - * u32 oc_osd_version; - * u32 oc_cap_key_sec; - * struct pnfs_osd_opaque_cred oc_cap_key - * struct pnfs_osd_opaque_cred oc_cap; - * }; // xdr size 32 + 4 + 4 + [variable] + [variable] - */ -static int -_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); - int ret; - - if (!p) - return -EIO; - - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p); - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); - if (unlikely(ret)) - return ret; - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); - return ret; -} - -/* - * struct pnfs_osd_data_map { - * u32 odm_num_comps; - * u64 odm_stripe_unit; - * u32 odm_group_width; - * u32 odm_group_depth; - * u32 odm_mirror_cnt; - * u32 odm_raid_algorithm; - * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 - */ -static inline int -_osd_data_map_xdr_sz(void) -{ - return 4 + 8 + 4 + 4 + 4 + 4; -} - -static __be32 * -_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) -{ - data_map->odm_num_comps = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); - data_map->odm_group_width = be32_to_cpup(p++); - data_map->odm_group_depth = be32_to_cpup(p++); - data_map->odm_mirror_cnt = be32_to_cpup(p++); - data_map->odm_raid_algorithm = be32_to_cpup(p++); - dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " - "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", - __func__, - data_map->odm_num_comps, - (unsigned long long)data_map->odm_stripe_unit, - data_map->odm_group_width, - data_map->odm_group_depth, - data_map->odm_mirror_cnt, - data_map->odm_raid_algorithm); - return p; -} - -int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) -{ - __be32 *p; - - memset(iter, 0, sizeof(*iter)); - - p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); - if (unlikely(!p)) - return -EINVAL; - - p = _osd_xdr_decode_data_map(p, &layout->olo_map); - layout->olo_comps_index = be32_to_cpup(p++); - layout->olo_num_comps = be32_to_cpup(p++); - dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, - layout->olo_comps_index, layout->olo_num_comps); - - iter->total_comps = layout->olo_num_comps; - return 0; -} - -bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, - int *err) -{ - BUG_ON(iter->decoded_comps > iter->total_comps); - if (iter->decoded_comps == iter->total_comps) - return false; - - *err = _osd_xdr_decode_object_cred(comp, xdr); - if (unlikely(*err)) { - dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " - "total_comps=%d\n", __func__, *err, - iter->decoded_comps, iter->total_comps); - return false; /* stop the loop */ - } - dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " - "key_len=%u cap_len=%u\n", - __func__, - _DEVID_LO(&comp->oc_object_id.oid_device_id), - _DEVID_HI(&comp->oc_object_id.oid_device_id), - comp->oc_object_id.oid_partition_id, - comp->oc_object_id.oid_object_id, - comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); - - iter->decoded_comps++; - return true; -} - -/* - * Get Device Information Decoding - * - * Note: since Device Information is currently done synchronously, all - * variable strings fields are left inside the rpc buffer and are only - * pointed to by the pnfs_osd_deviceaddr members. So the read buffer - * should not be freed while the returned information is in use. - */ -/* - *struct nfs4_string { - * unsigned int len; - * char *data; - *}; // size [variable] - * NOTE: Returned string points to inside the XDR buffer - */ -static __be32 * -__read_u8_opaque(__be32 *p, struct nfs4_string *str) -{ - str->len = be32_to_cpup(p++); - str->data = (char *)p; - - p += XDR_QUADLEN(str->len); - return p; -} - -/* - * struct pnfs_osd_targetid { - * u32 oti_type; - * struct nfs4_string oti_scsi_device_id; - * };// size 4 + [variable] - */ -static __be32 * -__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) -{ - u32 oti_type; - - oti_type = be32_to_cpup(p++); - targetid->oti_type = oti_type; - - switch (oti_type) { - case OBJ_TARGET_SCSI_NAME: - case OBJ_TARGET_SCSI_DEVICE_ID: - p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); - } - - return p; -} - -/* - * struct pnfs_osd_net_addr { - * struct nfs4_string r_netid; - * struct nfs4_string r_addr; - * }; - */ -static __be32 * -__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) -{ - p = __read_u8_opaque(p, &netaddr->r_netid); - p = __read_u8_opaque(p, &netaddr->r_addr); - - return p; -} - -/* - * struct pnfs_osd_targetaddr { - * u32 ota_available; - * struct pnfs_osd_net_addr ota_netaddr; - * }; - */ -static __be32 * -__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) -{ - u32 ota_available; - - ota_available = be32_to_cpup(p++); - targetaddr->ota_available = ota_available; - - if (ota_available) - p = __read_net_addr(p, &targetaddr->ota_netaddr); - - - return p; -} - -/* - * struct pnfs_osd_deviceaddr { - * struct pnfs_osd_targetid oda_targetid; - * struct pnfs_osd_targetaddr oda_targetaddr; - * u8 oda_lun[8]; - * struct nfs4_string oda_systemid; - * struct pnfs_osd_object_cred oda_root_obj_cred; - * struct nfs4_string oda_osdname; - * }; - */ - -/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does - * not have an xdr_stream - */ -static __be32 * -__read_opaque_cred(__be32 *p, - struct pnfs_osd_opaque_cred *opaque_cred) -{ - opaque_cred->cred_len = be32_to_cpu(*p++); - opaque_cred->cred = p; - return p + XDR_QUADLEN(opaque_cred->cred_len); -} - -static __be32 * -__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) -{ - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p++); - - p = __read_opaque_cred(p, &comp->oc_cap_key); - p = __read_opaque_cred(p, &comp->oc_cap); - return p; -} - -void pnfs_osd_xdr_decode_deviceaddr( - struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) -{ - p = __read_targetid(p, &deviceaddr->oda_targetid); - - p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); - - p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, - sizeof(deviceaddr->oda_lun)); - - p = __read_u8_opaque(p, &deviceaddr->oda_systemid); - - p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); - - p = __read_u8_opaque(p, &deviceaddr->oda_osdname); - - /* libosd likes this terminated in dbg. It's last, so no problems */ - deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; -} - -/* - * struct pnfs_osd_layoutupdate { - * u32 dsu_valid; - * s64 dsu_delta; - * u32 olu_ioerr_flag; - * }; xdr size 4 + 8 + 4 - */ -int -pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, - struct pnfs_osd_layoutupdate *lou) -{ - __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); - - if (!p) - return -E2BIG; - - *p++ = cpu_to_be32(lou->dsu_valid); - if (lou->dsu_valid) - p = xdr_encode_hyper(p, lou->dsu_delta); - *p++ = cpu_to_be32(lou->olu_ioerr_flag); - return 0; -} - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static inline __be32 * -pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) -{ - p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, - sizeof(object_id->oid_device_id.data)); - p = xdr_encode_hyper(p, object_id->oid_partition_id); - p = xdr_encode_hyper(p, object_id->oid_object_id); - - return p; -} - -/* - * struct pnfs_osd_ioerr { - * struct pnfs_osd_objid oer_component; - * u64 oer_comp_offset; - * u64 oer_comp_length; - * u32 oer_iswrite; - * u32 oer_errno; - * }; // xdr size 32 + 24 bytes - */ -void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) -{ - p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); - p = xdr_encode_hyper(p, ioerr->oer_comp_offset); - p = xdr_encode_hyper(p, ioerr->oer_comp_length); - *p++ = cpu_to_be32(ioerr->oer_iswrite); - *p = cpu_to_be32(ioerr->oer_errno); -} - -__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, 32 + 24); - if (unlikely(!p)) - dprintk("%s: out of xdr space\n", __func__); - - return p; -} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6e629b856a00..ad92b401326c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,19 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) -{ - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); - if (!p->pagevec) - p->npages = 0; - } - return p->pagevec != NULL; -} - struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { @@ -115,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx) TASK_KILLABLE); } +/** + * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O + * to complete + * @task: the rpc_task that should wait + * @l_ctx: nfs_lock_context with io_counter to check + * + * Returns true if there is outstanding I/O to wait on and the + * task has been put to sleep. + */ +bool +nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) +{ + struct inode *inode = d_inode(l_ctx->open_context->dentry); + bool ret = false; + + if (atomic_read(&l_ctx->io_count) > 0) { + rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL); + ret = true; + } + + if (atomic_read(&l_ctx->io_count) == 0) { + rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task); + ret = false; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); + /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -398,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - if (atomic_dec_and_test(&l_ctx->io_count)) + if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_atomic_t(&l_ctx->io_count); + if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) + rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); + } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -677,7 +696,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags) + int io_flags, + gfp_t gfp_flags) { struct nfs_pgio_mirror *new; int i; @@ -701,7 +721,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; desc->pg_mirrors = new; @@ -754,13 +774,24 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *last_page; struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; + struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; + gfp_t gfp_flags = GFP_KERNEL; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { - nfs_pgio_error(hdr); - desc->pg_error = -ENOMEM; - return desc->pg_error; + + if (pagecount <= ARRAY_SIZE(pg_array->page_array)) + pg_array->pagevec = pg_array->page_array; + else { + if (hdr->rw_mode == FMODE_WRITE) + gfp_flags = GFP_NOIO; + pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); + if (!pg_array->pagevec) { + pg_array->npages = 0; + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); @@ -1256,8 +1287,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) mirror = &desc->pg_mirrors[midx]; if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) - nfs_pageio_complete_mirror(desc, midx); + if (index != prev->wb_index + 1) { + nfs_pageio_complete(desc); + break; + } } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dd042498ce7c..c383d0913b54 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, static void pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) { + struct pnfs_layout_segment *lseg; lo->plh_return_iomode = 0; lo->plh_return_seq = 0; clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + } } static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) @@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_clear_layoutreturn_info(lo); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) @@ -563,7 +569,6 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) } } } -EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); /* * is l2 fully contained in l1? @@ -728,6 +733,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); + nfs_commit_inode(&nfsi->vfs_inode, 0); pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); @@ -1209,7 +1215,6 @@ out: dprintk("<-- %s status: %d\n", __func__, status); return status; } -EXPORT_SYMBOL_GPL(_pnfs_return_layout); int pnfs_commit_and_return_layout(struct inode *inode) @@ -1991,6 +1996,8 @@ out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (!pnfs_layout_is_valid(lo)) + nfs_commit_inode(ino, 0); return ERR_PTR(-EAGAIN); } @@ -2051,9 +2058,11 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_set_plh_return_info(lo, range.iomode, 0); - /* Block LAYOUTGET */ - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() @@ -2075,10 +2084,36 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); void +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +{ + if (pgio->pg_lseg == NULL || + test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + return; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); + +/* + * Check for any intersection between the request and the pgio->pg_lseg, + * and if none, put this pgio->pg_lseg away. + */ +static void +pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } +} + +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; + pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); @@ -2109,6 +2144,8 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { + pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -2169,16 +2206,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); - WARN_ON_ONCE(req_start >= seg_end); + /* start of request is past the last byte of this segment */ - if (req_start >= seg_end) { - /* reference the new lseg */ - if (pgio->pg_ops->pg_cleanup) - pgio->pg_ops->pg_cleanup(pgio); - if (pgio->pg_ops->pg_init) - pgio->pg_ops->pg_init(pgio, req); + if (req_start >= seg_end) return 0; - } /* adjust 'size' iff there are fewer bytes left in the * segment than what nfs_generic_pg_test returned */ @@ -2277,8 +2308,20 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) @@ -2408,10 +2451,20 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); - if (trypnfs == PNFS_TRY_AGAIN) - pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 590e1e35781f..99731e3e332f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -173,14 +173,9 @@ struct pnfs_layoutdriver_type { gfp_t gfp_flags); int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); - void (*encode_layoutreturn) (struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); - void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; @@ -239,6 +234,7 @@ void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, @@ -597,6 +593,16 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2); } +static inline bool +pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page *req) +{ + u64 seg_last = pnfs_end_offset(lseg->pls_range.offset, lseg->pls_range.length); + u64 req_last = req_offset(req) + req->wb_bytes; + + return pnfs_is_range_intersecting(lseg->pls_range.offset, seg_last, + req_offset(req), req_last); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7250b95549ec..d40755a0984b 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -217,7 +217,14 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(); + /* + * If the layout segment is invalid, then let + * pnfs_generic_retry_commit() clean up the bucket. + */ + if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) && + !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags)) + break; + data = nfs_commitdata_alloc(false); if (!data) break; data->ds_commit_index = i; @@ -283,16 +290,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->ds_commit_index = -1; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo, 0); - pnfs_generic_retry_commit(cinfo, 0); - return -ENOMEM; - } + data = nfs_commitdata_alloc(true); + data->ds_commit_index = -1; + list_add(&data->pages, &list); + nreq++; } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); @@ -619,7 +620,6 @@ void nfs4_pnfs_v3_ds_connect_unload(void) get_v3_ds_connect = NULL; } } -EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b7bca8303989..9872cf676a50 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } /* Helper functions for NFS lock bounds checking */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index defc9233e985..a8421d9dab6a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep; static struct nfs_pgio_header *nfs_readhdr_alloc(void) { - return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + + if (p) + p->rw_mode = FMODE_READ; + return p; } static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) @@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0); + server->rsize, 0, GFP_KERNEL); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { - .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f3822a4a7d5..c5334c0e23a1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2301,7 +2301,7 @@ EXPORT_SYMBOL_GPL(nfs_remount); /* * Initialise the common bits of the superblock */ -inline void nfs_initialise_sb(struct super_block *sb) +static void nfs_initialise_sb(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); @@ -2348,7 +2348,8 @@ EXPORT_SYMBOL_GPL(nfs_fill_super); /* * Finish setting up a cloned NFS2/3/4 superblock */ -void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +static void nfs_clone_super(struct super_block *sb, + struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); @@ -2544,10 +2545,25 @@ EXPORT_SYMBOL_GPL(nfs_set_sb_security); int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { + int error; + unsigned long kflags = 0, kflags_out = 0; + /* clone any lsm security options from the parent to the new sb */ if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_clone_mnt_opts(mount_info->cloned->sb, s, kflags, + &kflags_out); + if (error) + return error; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + return 0; } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cc341fc7fd44..db7ba542559e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + struct nfs_commit_data *p; - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); + if (never_fail) + p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + else { + /* It is OK to do some reclaim, not no safe to wait + * for anything to be returned to the pool. + * mempool_alloc() cannot handle that particular combination, + * so we need two separate attempts. + */ + p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); + if (!p) + p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | + __GFP_NOWARN | __GFP_NORETRY); + if (!p) + return NULL; } + + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); return p; } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); @@ -82,8 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) + if (p) { memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; + } return p; } @@ -547,9 +563,21 @@ static void nfs_write_error_remove_page(struct nfs_page *req) { nfs_unlock_request(req); nfs_end_page_writeback(req); - nfs_release_request(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); + nfs_release_request(req); +} + +static bool +nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + return false; + } + return nfs_error_is_fatal(err); } /* @@ -557,8 +585,7 @@ static void nfs_write_error_remove_page(struct nfs_page *req) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock, - bool launder) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; @@ -574,19 +601,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); ret = 0; + /* If there is a fatal error that covers this write, just exit */ + if (nfs_error_is_fatal_on_server(req->wb_context->error)) + goto out_launder; + if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* - * Remove the problematic req upon fatal errors - * in launder case, while other dirty pages can - * still be around until they get flushed. + * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { nfs_context_set_write_error(req->wb_context, ret); - if (launder) { - nfs_write_error_remove_page(req); - goto out; - } + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; } nfs_redirty_request(req); ret = -EAGAIN; @@ -595,16 +622,18 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, NFSIOS_WRITEPAGES, 1); out: return ret; +out_launder: + nfs_write_error_remove_page(req); + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio, bool launder) + struct nfs_pageio_descriptor *pgio) { int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, - launder); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -616,8 +645,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, * Write an mmapped page to the server. */ static int nfs_writepage_locked(struct page *page, - struct writeback_control *wbc, - bool launder) + struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -626,7 +654,7 @@ static int nfs_writepage_locked(struct page *page, nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio, launder); + err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -639,7 +667,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc, false); + ret = nfs_writepage_locked(page, wbc); unlock_page(page); return ret; } @@ -648,7 +676,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data, false); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); return ret; } @@ -1367,7 +1395,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags); + server->wsize, ioflags, GFP_NOIO); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1704,50 +1732,14 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(); - - if (!data) - goto out_bad; + data = nfs_commitdata_alloc(true); /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, 0); - out_bad: - nfs_retry_commit(head, NULL, cinfo, 0); - return -ENOMEM; -} - -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) -{ - struct inode *inode = file_inode(file); - struct nfs_open_context *open; - struct nfs_commit_info cinfo; - struct nfs_page *req; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out_put; - } - - nfs_init_cinfo_from_inode(&cinfo, inode); - - memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); - nfs_request_add_commit_list(req, &cinfo); - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret > 0) - ret = 0; - - nfs_free_request(req); -out_put: - put_nfs_open_context(open); - return ret; } -EXPORT_SYMBOL_GPL(nfs_commit_file); /* * COMMIT call returned @@ -1985,7 +1977,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); @@ -2002,7 +1994,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc, launder); + ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; continue; @@ -2107,7 +2099,6 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { - .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_done = nfs_writeback_done, diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index fb5213afc854..c862c2489df0 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -219,6 +219,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, u8 *buf, *d, type, assoc; int error; + if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) + return -EINVAL; + buf = kzalloc(bufflen, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -229,7 +232,6 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, goto out_free_buf; } req = scsi_req(rq); - scsi_req_init(rq); error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); if (error) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index e71f11b1a180..3bc08c394a3f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -486,7 +486,7 @@ secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static inline int -uuid_parse(char **mesg, char *buf, unsigned char **puuid) +nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; @@ -586,7 +586,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) - err = uuid_parse(&mesg, buf, &exp.ex_uuid); + err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d86031b6ad79..dadb3bf305b2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) return NULL; } - if (!(exp->ex_layout_types & (1 << layout_type))) { + if (layout_type >= LAYOUT_TYPE_MAX || + !(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -1768,6 +1769,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (!op->status) { if (opdesc->op_set_currentstateid) opdesc->op_set_currentstateid(cstate, &op->u); @@ -1778,14 +1785,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (need_wrongsec_check(rqstp)) op->status = check_nfsd_access(current_fh->fh_export, rqstp); } - encode_op: - /* Only from SEQUENCE */ - if (cstate->status == nfserr_replay_cache) { - dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; - goto out; - } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(&resp->xdr, op); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e9ef50addddb..22002fb75a18 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -int strdup_if_nonnull(char **target, char *source) -{ - if (source) { - *target = kstrdup(source, GFP_KERNEL); - if (!*target) - return -ENOMEM; - } else - *target = NULL; - return 0; -} - static int copy_cred(struct svc_cred *target, struct svc_cred *source) { - int ret; + target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); + target->cr_raw_principal = kstrdup(source->cr_raw_principal, + GFP_KERNEL); + if ((source->cr_principal && ! target->cr_principal) || + (source->cr_raw_principal && ! target->cr_raw_principal)) + return -ENOMEM; - ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal); - if (ret) - return ret; - ret = strdup_if_nonnull(&target->cr_raw_principal, - source->cr_raw_principal); - if (ret) - return ret; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 33017d652b1d..26780d53a6f9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2831,9 +2831,14 @@ out_acl: } #endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, - NFSD_SUPPATTR_EXCLCREAT_WORD1, - NFSD_SUPPATTR_EXCLCREAT_WORD2); + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); if (status) goto out; } @@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[gdev->gd_layout_type]; + const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, /* If maxcount is 0 then just update notifications */ if (gdev->gd_maxcount != 0) { + ops = nfsd4_layout_ops[gdev->gd_layout_type]; nfserr = ops->encode_getdeviceinfo(xdr, gdev); if (nfserr) { /* @@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[lgp->lg_layout_type]; + const struct nfsd4_layout_ops *ops; __be32 *p; dprintk("%s: err %d\n", __func__, nfserr); @@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(lgp->lg_seg.iomode); *p++ = cpu_to_be32(lgp->lg_layout_type); + ops = nfsd4_layout_ops[lgp->lg_layout_type]; nfserr = ops->encode_layoutget(xdr, lgp); out: kfree(lgp->lg_content); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8bf8f667a8cf..6493df6b1bd5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { - static struct tree_descr nfsd_files[] = { + static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9aaf6ca77569..38d0383dc7f9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, err = follow_down(&path); if (err < 0) goto out; + if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && + nfsd_mountpoint(dentry, exp) == 2) { + /* This is only a mountpoint in some other namespace */ + path_put(&path); + goto out; + } exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { @@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st /* * For nfsd purposes, we treat V4ROOT exports as though there was an * export at *every* directory. + * We return: + * '1' if this dentry *must* be an export point, + * '2' if it might be, if there is really a mount here, and + * '0' if there is no chance of an export point here. */ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { - if (d_mountpoint(dentry)) + if (!d_inode(dentry)) + return 0; + if (exp->ex_flags & NFSEXP_V4ROOT) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (!(exp->ex_flags & NFSEXP_V4ROOT)) - return 0; - return d_inode(dentry) != NULL; + if (d_mountpoint(dentry)) + /* + * Might only be a mountpoint in a different namespace, + * but we need to check. + */ + return 2; + return 0; } __be32 @@ -895,24 +911,13 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - mm_segment_t oldfs; + struct iov_iter iter; int host_err; - oldfs = get_fs(); - set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0); - set_fs(oldfs); - return nfsd_finish_read(file, count, host_err); -} + iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); + host_err = vfs_iter_read(file, &iter, &offset, 0); -static __be32 -nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) -{ - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - return nfsd_splice_read(rqstp, file, offset, count); - else - return nfsd_readv(file, offset, vec, vlen, count); + return nfsd_finish_read(file, count, host_err); } /* @@ -958,7 +963,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, unsigned long *cnt, int stable) { struct svc_export *exp; - mm_segment_t oldfs; + struct iov_iter iter; __be32 err = 0; int host_err; int use_wgather; @@ -984,10 +989,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (stable && !use_wgather) flags |= RWF_SYNC; - /* Write the data. */ - oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, flags); - set_fs(oldfs); + iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt); + host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; *cnt = host_err; @@ -1028,7 +1031,12 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ra = nfsd_init_raparms(file); trace_read_opened(rqstp, fhp, offset, vlen); - err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + err = nfsd_splice_read(rqstp, file, offset, count); + else + err = nfsd_readv(file, offset, vec, vlen, count); + trace_read_io_done(rqstp, fhp, offset, vlen); if (ra) @@ -1448,41 +1456,34 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) { - mm_segment_t oldfs; __be32 err; - int host_err; + const char *link; struct path path; + DEFINE_DELAYED_CALL(done); + int len; err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); - if (err) - goto out; + if (unlikely(err)) + return err; path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - err = nfserr_inval; - if (!d_is_symlink(path.dentry)) - goto out; + if (unlikely(!d_is_symlink(path.dentry))) + return nfserr_inval; touch_atime(&path); - /* N.B. Why does this call need a get_fs()?? - * Remove the set_fs and watch the fireworks:-) --okir - */ - oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp); - set_fs(oldfs); + link = vfs_get_link(path.dentry, &done); + if (IS_ERR(link)) + return nfserrno(PTR_ERR(link)); - if (host_err < 0) - goto out_nfserr; - *lenp = host_err; - err = 0; -out: - return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; + len = strlen(link); + if (len < *lenp) + *lenp = len; + memcpy(buf, link, *lenp); + do_delayed_call(&done); + return 0; } /* diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6f87b2ac1aeb..e73c86d9855c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index febed1217b3f..70ded52dc1dd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) } struct nilfs_segctor_wait_request { - wait_queue_t wq; + wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; @@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); - list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, - wq.task_list) { + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { wrq->err = err; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 01a9f0f007d4..0c4583b61717 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -161,16 +161,20 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask if (unlikely(!fsnotify_inode_watches_children(p_inode))) __fsnotify_update_child_dentry_flags(p_inode); else if (p_inode->i_fsnotify_mask & mask) { + struct name_snapshot name; + /* we are notifying a parent so come up with the new mask which * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; + take_dentry_name_snapshot(&name, dentry); if (path) ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - dentry->d_name.name, 0); + name.name, 0); else ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); + name.name, 0); + release_dentry_name_snapshot(&name); } dput(parent); diff --git a/fs/nsfs.c b/fs/nsfs.c index 323f492e0822..f3db56e83dd2 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -196,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 358258364616..4690cd75d8d7 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -159,7 +159,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, PTR_ERR(dent_inode)); kfree(name); /* Return the error code. */ - return (struct dentry *)dent_inode; + return ERR_CAST(dent_inode); } /* It is guaranteed that @name is no longer allocated at this point. */ if (MREF_ERR(mref) == -ENOENT) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0da0332725aa..ffe003982d95 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (bio->bi_error) { - mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); - wc->wc_error = bio->bi_error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 564c504d6efd..74a21f6695c8 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -426,6 +426,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3b7c937a36b5..4689940a953c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2591,6 +2591,10 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; + /* had_lock means that the currect process already takes the cluster + * lock previously. If had_lock is 1, we have nothing to do here, and + * it will get unlocked where we got the lock. + */ if (!had_lock) { ocfs2_remove_holder(lockres, oh); ocfs2_inode_unlock(inode, ex); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 827fc9809bc2..9f88188060db 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -119,7 +119,7 @@ check_err: if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); - result = (void *)inode; + result = ERR_CAST(inode); goto bail; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 382401d3e88f..1a1e0078ab38 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -136,7 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { - int rc = 0; + int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 44d178b8d1aa..5bb4a89f9045 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -25,6 +25,8 @@ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H +#include <linux/magic.h> + /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 @@ -56,9 +58,6 @@ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE -/* Filesystem magic number */ -#define OCFS2_SUPER_MAGIC 0x7461636f - /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 820359096c7a..d6c350ba25b9 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -631,7 +631,7 @@ static struct attribute *ocfs2_attrs[] = { NULL, }; -static struct attribute_group ocfs2_attr_group = { +static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ca1646fbcaef..83005f486451 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2062,7 +2062,7 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); - memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + memcpy(&sb->s_uuid, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 3c5384d9b3a5..f70c3778d600 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1328,20 +1328,21 @@ static int ocfs2_xattr_get(struct inode *inode, void *buffer, size_t buffer_size) { - int ret; + int ret, had_lock; struct buffer_head *di_bh = NULL; + struct ocfs2_lock_holder oh; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) { + mlog_errno(had_lock); + return had_lock; } down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); up_read(&OCFS2_I(inode)->ip_xattr_sem); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -3537,11 +3538,12 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits, ref_meta = 0, ref_credits = 0; + int ret, credits, had_lock, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_lock_holder oh; struct ocfs2_xattr_info xi = { .xi_name_index = name_index, @@ -3572,8 +3574,9 @@ int ocfs2_xattr_set(struct inode *inode, return -ENOMEM; } - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh); + if (had_lock < 0) { + ret = had_lock; mlog_errno(ret); goto cleanup_nolock; } @@ -3670,7 +3673,7 @@ cleanup: if (ret) mlog_errno(ret); } - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); diff --git a/fs/open.c b/fs/open.c index 4d23f729dcc6..35bb784763a4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -193,7 +193,8 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) goto out_putf; error = -EPERM; - if (IS_APPEND(inode)) + /* Check IS_APPEND on real upper inode */ + if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); @@ -459,20 +460,17 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); - struct inode *inode; - int error = -EBADF; + int error; error = -EBADF; if (!f.file) goto out; - inode = file_inode(f.file); - error = -ENOTDIR; - if (!S_ISDIR(inode->i_mode)) + if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -709,6 +707,9 @@ static int do_dentry_open(struct file *f, f->f_inode = inode; f->f_mapping = inode->i_mapping; + /* Ensure that we skip any errors that predate opening of the file */ + f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; @@ -761,6 +762,7 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); @@ -900,6 +902,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* + * Clear out all open flags we don't know about so that we don't report + * them in fcntl(F_GETFD) or similar interfaces. + */ + flags &= VALID_OPEN_FLAGS; + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 83b506020718..038d67545d9f 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -46,8 +46,8 @@ static void run_down(struct slot_map *m) spin_lock(&m->q.lock); if (m->c != -1) { for (;;) { - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (m->c == -1) @@ -84,8 +84,8 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail_exclusive(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); if (m->c > 0) @@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m) left = -EINTR; } while (left > 0); - if (!list_empty(&wait.task_list)) - list_del(&wait.task_list); + if (!list_empty(&wait.entry)) + list_del(&wait.entry); else if (left <= 0 && waitqueue_active(&m->q)) __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL); __set_current_state(TASK_RUNNING); diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 0daac5112f7a..c0c9683934b7 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,5 +1,6 @@ config OVERLAY_FS tristate "Overlay filesystem support" + select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem and a 'lower' filesystem. When a name exists in both filesystems, the diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 906ea6c93260..e5869f91b3ab 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -20,6 +20,7 @@ #include <linux/namei.h> #include <linux/fdtable.h> #include <linux/ratelimit.h> +#include <linux/exportfs.h> #include "overlayfs.h" #include "ovl_entry.h" @@ -232,6 +233,82 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } +static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_t *uuid) +{ + struct ovl_fh *fh; + int fh_type, fh_len, dwords; + void *buf; + int buflen = MAX_HANDLE_SZ; + + buf = kmalloc(buflen, GFP_TEMPORARY); + if (!buf) + return ERR_PTR(-ENOMEM); + + /* + * We encode a non-connectable file handle for non-dir, because we + * only need to find the lower inode number and we don't want to pay + * the price or reconnecting the dentry. + */ + dwords = buflen >> 2; + fh_type = exportfs_encode_fh(lower, buf, &dwords, 0); + buflen = (dwords << 2); + + fh = ERR_PTR(-EIO); + if (WARN_ON(fh_type < 0) || + WARN_ON(buflen > MAX_HANDLE_SZ) || + WARN_ON(fh_type == FILEID_INVALID)) + goto out; + + BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); + fh_len = offsetof(struct ovl_fh, fid) + buflen; + fh = kmalloc(fh_len, GFP_KERNEL); + if (!fh) { + fh = ERR_PTR(-ENOMEM); + goto out; + } + + fh->version = OVL_FH_VERSION; + fh->magic = OVL_FH_MAGIC; + fh->type = fh_type; + fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->len = fh_len; + fh->uuid = *uuid; + memcpy(fh->fid, buf, buflen); + +out: + kfree(buf); + return fh; +} + +static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) +{ + struct super_block *sb = lower->d_sb; + const struct ovl_fh *fh = NULL; + int err; + + /* + * When lower layer doesn't support export operations store a 'null' fh, + * so we can use the overlay.origin xattr to distignuish between a copy + * up and a pure upper inode. + */ + if (sb->s_export_op && sb->s_export_op->fh_to_dentry && + !uuid_is_null(&sb->s_uuid)) { + fh = ovl_encode_fh(lower, &sb->s_uuid); + if (IS_ERR(fh)) + return PTR_ERR(fh); + } + + /* + * Do not fail when upper doesn't support xattrs. + */ + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, + fh ? fh->len : 0, 0); + kfree(fh); + + return err; +} + static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *dentry, struct path *lowerpath, struct kstat *stat, const char *link, @@ -252,15 +329,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, .link = link }; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto out; - err = security_inode_copy_up(dentry, &new_creds); if (err < 0) - goto out1; + goto out; if (new_creds) old_creds = override_creds(new_creds); @@ -268,13 +339,14 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (tmpfile) temp = ovl_do_tmpfile(upperdir, stat->mode); else - temp = ovl_lookup_temp(workdir, dentry); - err = PTR_ERR(temp); - if (IS_ERR(temp)) - goto out1; - + temp = ovl_lookup_temp(workdir); err = 0; - if (!tmpfile) + if (IS_ERR(temp)) { + err = PTR_ERR(temp); + temp = NULL; + } + + if (!err && !tmpfile) err = ovl_create_real(wdir, temp, &cattr, NULL, true); if (new_creds) { @@ -283,7 +355,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, } if (err) - goto out2; + goto out; if (S_ISREG(stat->mode)) { struct path upperpath; @@ -316,6 +388,27 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; + /* + * Store identifier of lower inode in upper inode xattr to + * allow lookup of the copy up origin inode. + * + * Don't set origin when we are breaking the association with a lower + * hard link. + */ + if (S_ISDIR(stat->mode) || stat->nlink == 1) { + err = ovl_set_origin(dentry, lowerpath->dentry, temp); + if (err) + goto out_cleanup; + } + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + if (IS_ERR(upper)) { + err = PTR_ERR(upper); + upper = NULL; + goto out_cleanup; + } + if (tmpfile) err = ovl_do_link(temp, udir, upper, true); else @@ -329,17 +422,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, /* Restore timestamps on parent (best effort) */ ovl_set_timestamps(upperdir, pstat); -out2: +out: dput(temp); -out1: dput(upper); -out: return err; out_cleanup: if (!tmpfile) ovl_cleanup(wdir, temp); - goto out2; + goto out; } /* @@ -372,6 +463,11 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; + /* Mark parent "impure" because it may now contain non-pure upper */ + err = ovl_set_impure(parent, upperdir); + if (err) + return err; + err = vfs_getattr(&parentpath, &pstat, STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT); if (err) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6515796460df..a63a71656e9b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -41,7 +41,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) } } -struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -68,7 +68,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir, struct dentry *whiteout; struct inode *wdir = workdir->d_inode; - whiteout = ovl_lookup_temp(workdir, dentry); + whiteout = ovl_lookup_temp(workdir); if (IS_ERR(whiteout)) return whiteout; @@ -127,45 +127,26 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, return err; } -static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) +static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, + int xerr) { int err; - err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); if (!err) ovl_dentry_set_opaque(dentry); return err; } -static int ovl_dir_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) { - struct dentry *dentry = path->dentry; - int err; - enum ovl_path_type type; - struct path realpath; - const struct cred *old_cred; - - type = ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat, request_mask, flags); - revert_creds(old_cred); - if (err) - return err; - - stat->dev = dentry->d_sb->s_dev; - stat->ino = dentry->d_inode->i_ino; - /* - * It's probably not worth it to count subdirs to get the - * correct link count. nlink=1 seems to pacify 'find' and - * other utilities. + * Fail with -EIO when trying to create opaque dir and upper doesn't + * support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV) to + * return a specific error for noxattr case. */ - if (OVL_TYPE_MERGE(type)) - stat->nlink = 1; - - return 0; + return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); } /* Common operations required to be done after creation of file on upper */ @@ -182,6 +163,9 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, inc_nlink(inode); } d_instantiate(dentry, inode); + /* Force lookup of new upper hardlink to find its lower */ + if (hardlink) + d_drop(dentry); } static bool ovl_type_merge(struct dentry *dentry) @@ -189,6 +173,11 @@ static bool ovl_type_merge(struct dentry *dentry) return OVL_TYPE_MERGE(ovl_path_type(dentry)); } +static bool ovl_type_origin(struct dentry *dentry) +{ + return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -210,7 +199,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (err) goto out_dput; - if (ovl_type_merge(dentry->d_parent)) { + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } @@ -277,7 +266,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = ovl_lookup_temp(workdir, dentry); + opaquedir = ovl_lookup_temp(workdir); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; @@ -409,7 +398,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - newdentry = ovl_lookup_temp(workdir, dentry); + newdentry = ovl_lookup_temp(workdir); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; @@ -873,18 +862,16 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) if (IS_ERR(redirect)) return PTR_ERR(redirect); - err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, - redirect, strlen(redirect), 0); + err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry), + OVL_XATTR_REDIRECT, + redirect, strlen(redirect), -EXDEV); if (!err) { spin_lock(&dentry->d_lock); ovl_dentry_set_redirect(dentry, redirect); spin_unlock(&dentry->d_lock); } else { kfree(redirect); - if (err == -EOPNOTSUPP) - ovl_clear_redirect_dir(dentry->d_sb); - else - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); /* Fall back to userspace copy-up */ err = -EXDEV; } @@ -970,6 +957,25 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_upperdir = ovl_dentry_upper(old->d_parent); new_upperdir = ovl_dentry_upper(new->d_parent); + if (!samedir) { + /* + * When moving a merge dir or non-dir with copy up origin into + * a new parent, we are marking the new parent dir "impure". + * When ovl_iterate() iterates an "impure" upper dir, it will + * lookup the origin inodes of the entries to fill d_ino. + */ + if (ovl_type_origin(old)) { + err = ovl_set_impure(new->d_parent, new_upperdir); + if (err) + goto out_revert_creds; + } + if (!overwrite && ovl_type_origin(new)) { + err = ovl_set_impure(old->d_parent, old_upperdir); + if (err) + goto out_revert_creds; + } + } + trap = lock_rename(new_upperdir, old_upperdir); olddentry = lookup_one_len(old->d_name.name, old_upperdir, @@ -1019,7 +1025,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (ovl_type_merge_or_lower(old)) err = ovl_set_redirect(old, samedir); else if (!old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque(old, olddentry); + err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); if (err) goto out_dput; } @@ -1027,7 +1033,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (ovl_type_merge_or_lower(new)) err = ovl_set_redirect(new, samedir); else if (!new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque(new, newdentry); + err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); if (err) goto out_dput; } @@ -1070,7 +1076,7 @@ const struct inode_operations ovl_dir_inode_operations = { .create = ovl_create, .mknod = ovl_mknod, .permission = ovl_permission, - .getattr = ovl_dir_getattr, + .getattr = ovl_getattr, .listxattr = ovl_listxattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index f8fe6bf2036d..d613e2c41242 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -57,18 +57,78 @@ out: return err; } -static int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + enum ovl_path_type type; struct path realpath; const struct cred *old_cred; + bool is_dir = S_ISDIR(dentry->d_inode->i_mode); int err; - ovl_path_real(dentry, &realpath); + type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getattr(&realpath, stat, request_mask, flags); + if (err) + goto out; + + /* + * When all layers are on the same fs, all real inode number are + * unique, so we use the overlay st_dev, which is friendly to du -x. + * + * We also use st_ino of the copy up origin, if we know it. + * This guaranties constant st_dev/st_ino across copy up. + * + * If filesystem supports NFS export ops, this also guaranties + * persistent st_ino across mount cycle. + */ + if (ovl_same_sb(dentry->d_sb)) { + if (OVL_TYPE_ORIGIN(type)) { + struct kstat lowerstat; + u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); + + ovl_path_lower(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerstat, + lowermask, flags); + if (err) + goto out; + + WARN_ON_ONCE(stat->dev != lowerstat.dev); + /* + * Lower hardlinks are broken on copy up to different + * upper files, so we cannot use the lower origin st_ino + * for those different files, even for the same fs case. + */ + if (is_dir || lowerstat.nlink == 1) + stat->ino = lowerstat.ino; + } + stat->dev = dentry->d_sb->s_dev; + } else if (is_dir) { + /* + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino. + * + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (is_dir && OVL_TYPE_MERGE(type)) + stat->nlink = 1; + +out: revert_creds(old_cred); + return err; } @@ -180,6 +240,16 @@ int ovl_xattr_get(struct dentry *dentry, const char *name, return res; } +static bool ovl_can_list(const char *s) +{ + /* List all non-trusted xatts */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* Never list trusted.overlay, list other trusted for superuser only */ + return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN); +} + ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); @@ -203,7 +273,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return -EIO; len -= slen; - if (ovl_is_private_xattr(s)) { + if (!ovl_can_list(s)) { res -= slen; memmove(s, s + slen, len); } else { @@ -303,6 +373,41 @@ static const struct inode_operations ovl_symlink_inode_operations = { .update_time = ovl_update_time, }; +/* + * It is possible to stack overlayfs instance on top of another + * overlayfs instance as lower layer. We need to annonate the + * stackable i_mutex locks according to stack level of the super + * block instance. An overlayfs instance can never be in stack + * depth 0 (there is always a real fs below it). An overlayfs + * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth]. + * + * For example, here is a snip from /proc/lockdep_chains after + * dir_iterate of nested overlayfs: + * + * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) + * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) + * [...] &type->i_mutex_dir_key (stack_depth=0) + */ +#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH + +static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) +{ +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + + int depth = inode->i_sb->s_stack_depth - 1; + + if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) + depth = 0; + + if (S_ISDIR(inode->i_mode)) + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); + else + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); +#endif +} + static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_ino = get_next_ino(); @@ -312,6 +417,8 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif + ovl_lockdep_annotate_inode_mutex_key(inode); + switch (mode & S_IFMT) { case S_IFREG: inode->i_op = &ovl_file_inode_operations; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index b8b077821fb0..de0d4f742f36 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -12,6 +12,8 @@ #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> +#include <linux/mount.h> +#include <linux/exportfs.h> #include "overlayfs.h" #include "ovl_entry.h" @@ -81,19 +83,93 @@ invalid: goto err_free; } -static bool ovl_is_opaquedir(struct dentry *dentry) +static int ovl_acceptable(void *ctx, struct dentry *dentry) +{ + return 1; +} + +static struct dentry *ovl_get_origin(struct dentry *dentry, + struct vfsmount *mnt) { int res; - char val; + struct ovl_fh *fh = NULL; + struct dentry *origin = NULL; + int bytes; - if (!d_is_dir(dentry)) - return false; + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + goto fail; + } + /* Zero size value means "copied up but origin unknown" */ + if (res == 0) + return NULL; - res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); - if (res == 1 && val == 'y') - return true; + fh = kzalloc(res, GFP_TEMPORARY); + if (!fh) + return ERR_PTR(-ENOMEM); + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res); + if (res < 0) + goto fail; + + if (res < sizeof(struct ovl_fh) || res < fh->len) + goto invalid; + + if (fh->magic != OVL_FH_MAGIC) + goto invalid; + + /* Treat larger version and unknown flags as "origin unknown" */ + if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + goto out; + + /* Treat endianness mismatch as "origin unknown" */ + if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + goto out; + + bytes = (fh->len - offsetof(struct ovl_fh, fid)); + + /* + * Make sure that the stored uuid matches the uuid of the lower + * layer where file handle will be decoded. + */ + if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + goto out; - return false; + origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid, + bytes >> 2, (int)fh->type, + ovl_acceptable, NULL); + if (IS_ERR(origin)) { + /* Treat stale file handle as "origin unknown" */ + if (origin == ERR_PTR(-ESTALE)) + origin = NULL; + goto out; + } + + if (ovl_dentry_weird(origin) || + ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) { + dput(origin); + origin = NULL; + goto invalid; + } + +out: + kfree(fh); + return origin; + +fail: + pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + goto out; +invalid: + pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + goto out; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); } static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, @@ -192,6 +268,45 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, return 0; } + +static int ovl_check_origin(struct dentry *dentry, struct dentry *upperdentry, + struct path **stackp, unsigned int *ctrp) +{ + struct super_block *same_sb = ovl_same_sb(dentry->d_sb); + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; + struct vfsmount *mnt; + struct dentry *origin; + + if (!same_sb || !roe->numlower) + return 0; + + /* + * Since all layers are on the same fs, we use the first layer for + * decoding the file handle. We may get a disconnected dentry, + * which is fine, because we only need to hold the origin inode in + * cache and use its inode number. We may even get a connected dentry, + * that is not under the first layer's root. That is also fine for + * using it's inode number - it's the same as if we held a reference + * to a dentry in first layer that was moved under us. + */ + mnt = roe->lowerstack[0].mnt; + + origin = ovl_get_origin(upperdentry, mnt); + if (IS_ERR_OR_NULL(origin)) + return PTR_ERR(origin); + + BUG_ON(*stackp || *ctrp); + *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY); + if (!*stackp) { + dput(origin); + return -ENOMEM; + } + **stackp = (struct path) { .dentry = origin, .mnt = mnt }; + *ctrp = 1; + + return 0; +} + /* * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. @@ -220,11 +335,13 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, const struct cred *old_cred; struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; + bool upperimpure = false; char *upperredirect = NULL; struct dentry *this; unsigned int i; @@ -253,15 +370,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = -EREMOTE; goto out; } + if (upperdentry && !d.is_dir) { + BUG_ON(!d.stop || d.redirect); + err = ovl_check_origin(dentry, upperdentry, + &stack, &ctr); + if (err) + goto out; + } if (d.redirect) { upperredirect = kstrdup(d.redirect, GFP_KERNEL); if (!upperredirect) goto out_put_upper; if (d.redirect[0] == '/') - poe = dentry->d_sb->s_root->d_fsdata; + poe = roe; } upperopaque = d.opaque; + if (upperdentry && d.is_dir) + upperimpure = ovl_is_impuredir(upperdentry); } if (!d.stop && poe->numlower) { @@ -290,10 +416,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.stop) break; - if (d.redirect && - d.redirect[0] == '/' && - poe != dentry->d_sb->s_root->d_fsdata) { - poe = dentry->d_sb->s_root->d_fsdata; + if (d.redirect && d.redirect[0] == '/' && poe != roe) { + poe = roe; /* Find the current layer on the root dentry */ for (i = 0; i < poe->numlower; i++) @@ -332,6 +456,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, revert_creds(old_cred); oe->opaque = upperopaque; + oe->impure = upperimpure; oe->redirect = upperredirect; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 741dc0b6931f..10863b4105fa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -8,18 +8,57 @@ */ #include <linux/kernel.h> +#include <linux/uuid.h> enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), + __OVL_PATH_ORIGIN = (1 << 2), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." #define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" +#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" +#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" + +/* + * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, + * where: + * origin.fh - exported file handle of the lower file + * origin.uuid - uuid of the lower filesystem + */ +#define OVL_FH_VERSION 0 +#define OVL_FH_MAGIC 0xfb + +/* CPU byte order required for fid decoding: */ +#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) +#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) + +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN) + +#if defined(__LITTLE_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN 0 +#elif defined(__BIG_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN +#else +#error Endianness not defined +#endif + +/* On-disk and in-memeory format for redirect by file handle */ +struct ovl_fh { + u8 version; /* 0 */ + u8 magic; /* 0xfb */ + u8 len; /* size of this header + size of fid */ + u8 flags; /* OVL_FH_FLAG_* */ + u8 type; /* fid_type of fid */ + uuid_t uuid; /* uuid of filesystem */ + u8 fid[0]; /* file identifier */ +} __packed; #define OVL_ISUPPER_MASK 1UL @@ -151,6 +190,7 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); +struct super_block *ovl_same_sb(struct super_block *sb); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); bool ovl_dentry_remote(struct dentry *dentry); bool ovl_dentry_weird(struct dentry *dentry); @@ -164,10 +204,10 @@ struct dentry *ovl_dentry_real(struct dentry *dentry); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); bool ovl_dentry_is_opaque(struct dentry *dentry); +bool ovl_dentry_is_impure(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); bool ovl_redirect_dir(struct super_block *sb); -void ovl_clear_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); @@ -180,6 +220,17 @@ bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr); +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); + +static inline bool ovl_is_impuredir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); +} + /* namei.c */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path); @@ -197,6 +248,8 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int ovl_permission(struct inode *inode, int mask); int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); @@ -222,7 +275,7 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct dentry *workdir); struct cattr { dev_t rdev; umode_t mode; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 59614faa14c3..34bc4a9f5c61 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -28,7 +28,10 @@ struct ovl_fs { /* creds of process who forced instantiation of super block */ const struct cred *creator_cred; bool tmpfile; + bool noxattr; wait_queue_head_t copyup_wq; + /* sb common to all layers */ + struct super_block *same_sb; }; /* private information held for every overlayfs dentry */ @@ -40,6 +43,7 @@ struct ovl_entry { u64 version; const char *redirect; bool opaque; + bool impure; bool copying; }; struct rcu_head rcu; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c9e70d39c1ea..4882ffb37bae 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -49,11 +49,28 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_check_append_only(struct inode *inode, int flag) +{ + /* + * This test was moot in vfs may_open() because overlay inode does + * not have the S_APPEND flag, so re-check on real upper inode + */ + if (IS_APPEND(inode)) { + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) + return -EPERM; + if (flag & O_TRUNC) + return -EPERM; + } + + return 0; +} + static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, unsigned int open_flags) { struct dentry *real; + int err; if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) @@ -65,15 +82,20 @@ static struct dentry *ovl_d_real(struct dentry *dentry, return dentry; if (open_flags) { - int err = ovl_open_maybe_copy_up(dentry, open_flags); - + err = ovl_open_maybe_copy_up(dentry, open_flags); if (err) return ERR_PTR(err); } real = ovl_dentry_upper(dentry); - if (real && (!inode || inode == d_inode(real))) + if (real && (!inode || inode == d_inode(real))) { + if (!inode) { + err = ovl_check_append_only(d_inode(real), open_flags); + if (err) + return ERR_PTR(err); + } return real; + } real = ovl_dentry_lower(dentry); if (!real) @@ -709,8 +731,8 @@ static const struct xattr_handler *ovl_xattr_handlers[] = { static int ovl_fill_super(struct super_block *sb, void *data, int silent) { - struct path upperpath = { NULL, NULL }; - struct path workpath = { NULL, NULL }; + struct path upperpath = { }; + struct path workpath = { }; struct dentry *root_dentry; struct inode *realinode; struct ovl_entry *oe; @@ -869,6 +891,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) dput(temp); else pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* + * xattr + */ + err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE, + "0", 1, 0); + if (err) { + ufs->noxattr = true; + pr_warn("overlayfs: upper fs does not support xattr.\n"); + } else { + vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE); + } } } @@ -892,11 +927,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->lower_mnt[ufs->numlower] = mnt; ufs->numlower++; + + /* Check if all lower layers are on same sb */ + if (i == 0) + ufs->same_sb = mnt->mnt_sb; + else if (ufs->same_sb != mnt->mnt_sb) + ufs->same_sb = NULL; } /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; + else if (ufs->upper_mnt->mnt_sb != ufs->same_sb) + ufs->same_sb = NULL; if (remote) sb->s_d_op = &ovl_reval_dentry_operations; @@ -931,7 +974,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) path_put(&workpath); kfree(lowertmp); - oe->__upperdentry = upperpath.dentry; + if (upperpath.dentry) { + oe->__upperdentry = upperpath.dentry; + oe->impure = ovl_is_impuredir(upperpath.dentry); + } for (i = 0; i < numlower; i++) { oe->lowerstack[i].dentry = stack[i].dentry; oe->lowerstack[i].mnt = ufs->lower_mnt[i]; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 6e610a205e15..809048913889 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -40,6 +40,13 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } +struct super_block *ovl_same_sb(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->same_sb; +} + struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); @@ -75,11 +82,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) type = __OVL_PATH_UPPER; /* - * Non-dir dentry can hold lower dentry from previous - * location. + * Non-dir dentry can hold lower dentry of its copy up origin. */ - if (oe->numlower && d_is_dir(dentry)) - type |= __OVL_PATH_MERGE; + if (oe->numlower) { + type |= __OVL_PATH_ORIGIN; + if (d_is_dir(dentry)) + type |= __OVL_PATH_MERGE; + } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; @@ -100,7 +109,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = dentry->d_fsdata; - *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { }; } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) @@ -166,6 +175,13 @@ bool ovl_dentry_is_opaque(struct dentry *dentry) return oe->opaque; } +bool ovl_dentry_is_impure(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->impure; +} + bool ovl_dentry_is_whiteout(struct dentry *dentry) { return !dentry->d_inode && ovl_dentry_is_opaque(dentry); @@ -182,14 +198,7 @@ bool ovl_redirect_dir(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - return ofs->config.redirect_dir; -} - -void ovl_clear_redirect_dir(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - ofs->config.redirect_dir = false; + return ofs->config.redirect_dir && !ofs->noxattr; } const char *ovl_dentry_get_redirect(struct dentry *dentry) @@ -294,3 +303,59 @@ void ovl_copy_up_end(struct dentry *dentry) wake_up_locked(&ofs->copyup_wq); spin_unlock(&ofs->copyup_wq.lock); } + +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) +{ + int res; + char val; + + if (!d_is_dir(dentry)) + return false; + + res = vfs_getxattr(dentry, name, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr) +{ + int err; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (ofs->noxattr) + return xerr; + + err = ovl_do_setxattr(upperdentry, name, value, size, 0); + + if (err == -EOPNOTSUPP) { + pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + ofs->noxattr = true; + return xerr; + } + + return err; +} + +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) +{ + int err; + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->impure) + return 0; + + /* + * Do not fail when upper doesn't support xattrs. + * Upper inodes won't have origin nor redirect xattr anyway. + */ + err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, + "y", 1, 0); + if (!err) + oe->impure = true; + + return err; +} diff --git a/fs/pnode.c b/fs/pnode.c index 5bc7896d122a..53d411a371ce 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p) return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } +static inline struct mount *last_slave(struct mount *p) +{ + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); +} + static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m, } } +static struct mount *skip_propagation_subtree(struct mount *m, + struct mount *origin) +{ + /* + * Advance m such that propagation_next will not return + * the slaves of m. + */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + m = last_slave(m); + + return m; +} + static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { @@ -413,65 +431,104 @@ void propagate_mount_unlock(struct mount *mnt) } } -/* - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. - */ -static void mark_umount_candidates(struct mount *mnt) +static void umount_one(struct mount *mnt, struct list_head *to_umount) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); - - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) - continue; - if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { - SET_MNT_MARK(child); - } - } + CLEAR_MNT_MARK(mnt); + mnt->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_umounting); + list_move_tail(&mnt->mnt_list, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct mount *mnt) +static bool __propagate_umount(struct mount *mnt, + struct list_head *to_umount, + struct list_head *to_restore) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; + bool progress = false; + struct mount *child; - BUG_ON(parent == mnt); + /* + * The state of the parent won't change if this mount is + * already unmounted or marked as without children. + */ + if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) + goto out; - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *topper; - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - /* - * umount the child only if the child has no children - * and the child is marked safe to unmount. - */ - if (!child || !IS_MNT_MARKED(child)) + /* Verify topper is the only grandchild that has not been + * speculatively unmounted. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; - CLEAR_MNT_MARK(child); + if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) + continue; + /* Found a mounted child */ + goto children; + } - /* If there is exactly one mount covering all of child - * replace child with that mount. - */ - topper = find_topper(child); - if (topper) - mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, - topper); + /* Mark mounts that can be unmounted if not locked */ + SET_MNT_MARK(mnt); + progress = true; + + /* If a mount is without children and not locked umount it. */ + if (!IS_MNT_LOCKED(mnt)) { + umount_one(mnt, to_umount); + } else { +children: + list_move_tail(&mnt->mnt_umounting, to_restore); + } +out: + return progress; +} + +static void umount_list(struct list_head *to_umount, + struct list_head *to_restore) +{ + struct mount *mnt, *child, *tmp; + list_for_each_entry(mnt, to_umount, mnt_list) { + list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { + /* topper? */ + if (child->mnt_mountpoint == mnt->mnt.mnt_root) + list_move_tail(&child->mnt_umounting, to_restore); + else + umount_one(child, to_umount); + } + } +} - if (list_empty(&child->mnt_mounts)) { - list_del_init(&child->mnt_child); - child->mnt.mnt_flags |= MNT_UMOUNT; - list_move_tail(&child->mnt_list, &mnt->mnt_list); +static void restore_mounts(struct list_head *to_restore) +{ + /* Restore mounts to a clean working state */ + while (!list_empty(to_restore)) { + struct mount *mnt, *parent; + struct mountpoint *mp; + + mnt = list_first_entry(to_restore, struct mount, mnt_umounting); + CLEAR_MNT_MARK(mnt); + list_del_init(&mnt->mnt_umounting); + + /* Should this mount be reparented? */ + mp = mnt->mnt_mp; + parent = mnt->mnt_parent; + while (parent->mnt.mnt_flags & MNT_UMOUNT) { + mp = parent->mnt_mp; + parent = parent->mnt_parent; } + if (parent != mnt->mnt_parent) + mnt_change_mountpoint(parent, mp, mnt); + } +} + +static void cleanup_umount_visitations(struct list_head *visited) +{ + while (!list_empty(visited)) { + struct mount *mnt = + list_first_entry(visited, struct mount, mnt_umounting); + list_del_init(&mnt->mnt_umounting); } } @@ -485,11 +542,68 @@ static void __propagate_umount(struct mount *mnt) int propagate_umount(struct list_head *list) { struct mount *mnt; + LIST_HEAD(to_restore); + LIST_HEAD(to_umount); + LIST_HEAD(visited); + + /* Find candidates for unmounting */ + list_for_each_entry_reverse(mnt, list, mnt_list) { + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + /* + * If this mount has already been visited it is known that it's + * entire peer group and all of their slaves in the propagation + * tree for the mountpoint has already been visited and there is + * no need to visit them again. + */ + if (!list_empty(&mnt->mnt_umounting)) + continue; + + list_add_tail(&mnt->mnt_umounting, &visited); + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt(&m->mnt, + mnt->mnt_mountpoint); + if (!child) + continue; + + if (!list_empty(&child->mnt_umounting)) { + /* + * If the child has already been visited it is + * know that it's entire peer group and all of + * their slaves in the propgation tree for the + * mountpoint has already been visited and there + * is no need to visit this subtree again. + */ + m = skip_propagation_subtree(m, parent); + continue; + } else if (child->mnt.mnt_flags & MNT_UMOUNT) { + /* + * We have come accross an partially unmounted + * mount in list that has not been visited yet. + * Remember it has been visited and continue + * about our merry way. + */ + list_add_tail(&child->mnt_umounting, &visited); + continue; + } + + /* Check the child and parents while progress is made */ + while (__propagate_umount(child, + &to_umount, &to_restore)) { + /* Is the parent a umount candidate? */ + child = child->mnt_parent; + if (list_empty(&child->mnt_umounting)) + break; + } + } + } - list_for_each_entry_reverse(mnt, list, mnt_list) - mark_umount_candidates(mnt); + umount_list(&to_umount, &to_restore); + restore_mounts(&to_restore); + cleanup_umount_visitations(&visited); + list_splice_tail(&to_umount, list); - list_for_each_entry(mnt, list, mnt_list) - __propagate_umount(mnt); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e3ac5c11780..f1e1927ccd48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -821,10 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ - flags = FOLL_FORCE; - if (write) - flags |= FOLL_WRITE; + flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275..e250910cffc8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4ee55274f155..45629f4b5402 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -504,7 +504,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if (is_vmalloc_or_module_addr((void *)start)) { + } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, buf, tsz)) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1..3803b24ca220 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0c8b33d99b1..520802da059c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -300,11 +300,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (stack_guard_page_start(vma, start)) - start += PAGE_SIZE; end = vma->vm_end; - if (stack_guard_page_end(vma, end)) - end -= PAGE_SIZE; seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 792a4e5f9226..4d02c3b65061 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -349,48 +349,48 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) switch (record->type) { case PSTORE_TYPE_DMESG: - scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", + scnprintf(name, sizeof(name), "dmesg-%s-%llu%s", record->psi->name, record->id, record->compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%lld", + scnprintf(name, sizeof(name), "console-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%lld", + scnprintf(name, sizeof(name), "ftrace-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%lld", + scnprintf(name, sizeof(name), "mce-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%lld", + scnprintf(name, sizeof(name), "rtas-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OF: - scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_COMMON: - scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-common-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%lld", + scnprintf(name, sizeof(name), "pmsg-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OPAL: - scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%lld", + scnprintf(name, sizeof(name), "unknown-%s-%llu", record->psi->name, record->id); break; default: - scnprintf(name, sizeof(name), "type%d-%s-%lld", + scnprintf(name, sizeof(name), "type%d-%s-%llu", record->type, record->psi->name, record->id); break; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index c416e653dc4f..58051265626f 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -30,5 +30,7 @@ extern void pstore_get_backend_records(struct pstore_info *psi, extern int pstore_mkfile(struct dentry *root, struct pstore_record *record); extern bool pstore_is_mounted(void); +extern void pstore_record_init(struct pstore_record *record, + struct pstore_info *psi); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d468eec9b8a6..1b6e0ff6bff5 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -474,6 +474,20 @@ static size_t copy_kmsg_to_buffer(int hsize, size_t len) return total_len; } +void pstore_record_init(struct pstore_record *record, + struct pstore_info *psinfo) +{ + memset(record, 0, sizeof(*record)); + + record->psi = psinfo; + + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(&record->time)) { + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + } +} + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -509,15 +523,14 @@ static void pstore_dump(struct kmsg_dumper *dumper, int header_size; int zipped_len = -1; size_t dump_size; - struct pstore_record record = { - .type = PSTORE_TYPE_DMESG, - .count = oopscount, - .reason = reason, - .part = part, - .compressed = false, - .buf = psinfo->buf, - .psi = psinfo, - }; + struct pstore_record record; + + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_DMESG; + record.count = oopscount; + record.reason = reason; + record.part = part; + record.buf = psinfo->buf; if (big_oops_buf && is_locked) { dst = big_oops_buf; @@ -587,12 +600,12 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { - struct pstore_record record = { - .type = PSTORE_TYPE_CONSOLE, - .psi = psinfo, - }; + struct pstore_record record; unsigned long flags; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; + if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -640,19 +653,16 @@ static int pstore_write_user_compat(struct pstore_record *record, if (record->buf) return -EINVAL; - record->buf = kmalloc(record->size, GFP_KERNEL); - if (!record->buf) - return -ENOMEM; - - if (unlikely(copy_from_user(record->buf, buf, record->size))) { - ret = -EFAULT; + record->buf = memdup_user(buf, record->size); + if (unlikely(IS_ERR(record->buf))) { + ret = PTR_ERR(record->buf); goto out; } ret = record->psi->write(record); -out: kfree(record->buf); +out: record->buf = NULL; return unlikely(ret < 0) ? ret : record->size; @@ -770,8 +780,11 @@ static void decompress_record(struct pstore_record *record) int unzipped_len; char *decompressed; + if (!record->compressed) + return; + /* Only PSTORE_TYPE_DMESG support compression. */ - if (!record->compressed || record->type != PSTORE_TYPE_DMESG) { + if (record->type != PSTORE_TYPE_DMESG) { pr_warn("ignored compressed record type %d\n", record->type); return; } @@ -819,6 +832,7 @@ void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet) { int failed = 0; + unsigned int stop_loop = 65536; if (!psi || !root) return; @@ -832,7 +846,7 @@ void pstore_get_backend_records(struct pstore_info *psi, * may reallocate record.buf. On success, pstore_mkfile() will keep * the record.buf, so free it only on failure. */ - for (;;) { + for (; stop_loop; stop_loop--) { struct pstore_record *record; int rc; @@ -841,13 +855,15 @@ void pstore_get_backend_records(struct pstore_info *psi, pr_err("out of memory creating record\n"); break; } - record->psi = psi; + pstore_record_init(record, psi); record->size = psi->read(record); /* No more records left in backend? */ - if (record->size <= 0) + if (record->size <= 0) { + kfree(record); break; + } decompress_record(record); rc = pstore_mkfile(root, record); @@ -865,8 +881,11 @@ out: mutex_unlock(&psi->read_mutex); if (failed) - pr_warn("failed to load %d record(s) from '%s'\n", + pr_warn("failed to create %d record(s) from '%s'\n", failed, psi->name); + if (!stop_loop) + pr_err("looping? Too many records seen from '%s'\n", + psi->name); } static void pstore_dowork(struct work_struct *work) diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 209755e0d7c8..24db02de1787 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -22,16 +22,16 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct pstore_record record = { - .type = PSTORE_TYPE_PMSG, - .size = count, - .psi = psinfo, - }; + struct pstore_record record; int ret; if (!count) return 0; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_PMSG; + record.size = count; + /* check outside lock, page in any data. write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 5523df7f17ef..7125b398d312 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -27,7 +27,6 @@ #include <linux/module.h> #include <linux/version.h> #include <linux/pstore.h> -#include <linux/time.h> #include <linux/io.h> #include <linux/ioport.h> #include <linux/platform_device.h> @@ -58,7 +57,7 @@ module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); MODULE_PARM_DESC(pmsg_size, "size of user space message log"); static unsigned long long mem_address; -module_param(mem_address, ullong, 0400); +module_param_hw(mem_address, ullong, other, 0400); MODULE_PARM_DESC(mem_address, "start of reserved RAM used to store oops/panic logs"); @@ -356,20 +355,15 @@ out: } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, - bool compressed) + struct pstore_record *record) { char *hdr; - struct timespec timestamp; size_t len; - /* Report zeroed timestamp if called before timekeeping has resumed. */ - if (__getnstimeofday(×tamp)) { - timestamp.tv_sec = 0; - timestamp.tv_nsec = 0; - } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", - (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000), - compressed ? 'C' : 'D'); + record->time.tv_sec, + record->time.tv_nsec / 1000, + record->compressed ? 'C' : 'D'); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -440,7 +434,7 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; /* Build header and append record contents. */ - hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + hlen = ramoops_write_kmsg_hdr(prz, record); size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ebf80c7739e1..53a17496c5c5 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1512,6 +1512,22 @@ int dquot_initialize(struct inode *inode) } EXPORT_SYMBOL(dquot_initialize); +bool dquot_initialize_needed(struct inode *inode) +{ + struct dquot **dquots; + int i; + + if (!dquot_active(inode)) + return false; + + dquots = i_dquot(inode); + for (i = 0; i < MAXQUOTAS; i++) + if (!dquots[i] && sb_has_quota_active(inode->i_sb, i)) + return true; + return false; +} +EXPORT_SYMBOL(dquot_initialize_needed); + /* * Release all quotas referenced by inode. * @@ -1894,6 +1910,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t space, cur_space; qsize_t rsv_space = 0; + qsize_t inode_usage = 1; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; char is_valid[MAXQUOTAS] = {}; @@ -1903,6 +1920,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (IS_NOQUOTA(inode)) return 0; + + if (inode->i_sb->dq_op->get_inode_usage) { + ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); + if (ret) + return ret; + } + /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; @@ -1930,7 +1954,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = i_dquot(inode)[cnt]; - ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); + ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); @@ -1947,7 +1971,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; - wtype = info_idq_free(transfer_from[cnt], 1); + wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); @@ -1955,13 +1979,13 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); - dquot_decr_inodes(transfer_from[cnt], 1); + dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); } - dquot_incr_inodes(transfer_to[cnt], 1); + dquot_incr_inodes(transfer_to[cnt], inode_usage); dquot_incr_space(transfer_to[cnt], cur_space); dquot_resv_space(transfer_to[cnt], rsv_space); diff --git a/fs/read_write.c b/fs/read_write.c index 47c1d4484df9..0cc7033aa413 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -356,46 +356,6 @@ out_putf: } #endif -ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - if (!file->f_op->read_iter) - return -EINVAL; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - iter->type |= READ; - ret = call_read_iter(file, &kiocb, iter); - BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(vfs_iter_read); - -ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - if (!file->f_op->write_iter) - return -EINVAL; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - iter->type |= WRITE; - ret = call_write_iter(file, &kiocb, iter); - BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(vfs_iter_write); - int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; @@ -678,16 +638,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) - return -EOPNOTSUPP; - init_sync_kiocb(&kiocb, filp); - if (flags & RWF_HIPRI) - kiocb.ki_flags |= IOCB_HIPRI; - if (flags & RWF_DSYNC) - kiocb.ki_flags |= IOCB_DSYNC; - if (flags & RWF_SYNC) - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + ret = kiocb_set_rw_flags(&kiocb, flags); + if (ret) + return ret; kiocb.ki_pos = *ppos; if (type == READ) @@ -916,86 +870,114 @@ out: } #endif -static ssize_t __do_readv_writev(int type, struct file *file, - struct iov_iter *iter, loff_t *pos, int flags) +static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, + loff_t *pos, int flags) { size_t tot_len; ssize_t ret = 0; + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(type, file, pos, tot_len); + ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) - goto out; - - if (type != READ) - file_start_write(file); + return ret; - if ((type == READ && file->f_op->read_iter) || - (type == WRITE && file->f_op->write_iter)) - ret = do_iter_readv_writev(file, iter, pos, type, flags); + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, iter, pos, READ, flags); else - ret = do_loop_readv_writev(file, iter, pos, type, flags); - - if (type != READ) - file_end_write(file); - + ret = do_loop_readv_writev(file, iter, pos, READ, flags); out: - if ((ret + (type == READ)) > 0) { - if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } + if (ret >= 0) + fsnotify_access(file); return ret; } -static ssize_t do_readv_writev(int type, struct file *file, - const struct iovec __user *uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, + int flags) { - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; + if (!file->f_op->read_iter) + return -EINVAL; + return do_iter_read(file, iter, ppos, flags); +} +EXPORT_SYMBOL(vfs_iter_read); + +static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, + loff_t *pos, int flags) +{ + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) return ret; - ret = __do_readv_writev(type, file, &iter, pos, flags); - kfree(iov); - + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); return ret; } +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, + int flags) +{ + if (!file->f_op->write_iter) + return -EINVAL; + return do_iter_write(file, iter, ppos, flags); +} +EXPORT_SYMBOL(vfs_iter_write); + ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - if (!(file->f_mode & FMODE_READ)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_READ)) - return -EINVAL; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; - return do_readv_writev(READ, file, vec, vlen, pos, flags); -} + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + ret = do_iter_read(file, &iter, pos, flags); + kfree(iov); + } + return ret; +} EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - if (!(file->f_mode & FMODE_WRITE)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_WRITE)) - return -EINVAL; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; - return do_readv_writev(WRITE, file, vec, vlen, pos, flags); + ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + file_start_write(file); + ret = do_iter_write(file, &iter, pos, flags); + file_end_write(file); + kfree(iov); + } + return ret; } - EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, @@ -1143,44 +1125,20 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, } #ifdef CONFIG_COMPAT - -static ssize_t compat_do_readv_writev(int type, struct file *file, - const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos, int flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; - ret = compat_import_iovec(type, uvector, nr_segs, - UIO_FASTIOV, &iov, &iter); - if (ret < 0) - return ret; - - ret = __do_readv_writev(type, file, &iter, pos, flags); - kfree(iov); - - return ret; -} - -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!(file->f_mode & FMODE_CAN_READ)) - goto out; - - ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags); - -out: + ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); + if (ret >= 0) { + ret = do_iter_read(file, &iter, pos, flags); + kfree(iov); + } if (ret > 0) add_rchar(current, ret); inc_syscr(current); @@ -1276,18 +1234,18 @@ static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!(file->f_mode & FMODE_CAN_WRITE)) - goto out; - - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0); + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; -out: + ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); + if (ret >= 0) { + file_start_write(file); + ret = do_iter_write(file, &iter, pos, flags); + file_end_write(file); + kfree(iov); + } if (ret > 0) add_wchar(current, ret); inc_syscw(current); diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index aca73dd73906..e3c558d1b78c 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi) } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - errcatch_print_vi + .bytes_number = errcatch_bytes_number, + .decrement_key = errcatch_decrement_key, + .is_left_mergeable = errcatch_is_left_mergeable, + .print_item = errcatch_print_item, + .check_item = errcatch_check_item, + + .create_vi = errcatch_create_vi, + .check_left = errcatch_check_left, + .check_right = errcatch_check_right, + .part_size = errcatch_part_size, + .unit_num = errcatch_unit_num, + .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index da01f497180a..a11d773e5ff3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1112,7 +1112,7 @@ static int flush_commit_list(struct super_block *s, depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) __sync_dirty_buffer(jl->j_commit_bh, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1271,7 +1271,7 @@ static int _update_journal_header_block(struct super_block *sb, if (reiserfs_barrier_flush(sb)) __sync_dirty_buffer(journal->j_header_bh, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); @@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s) static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; + wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); diff --git a/fs/select.c b/fs/select.c index bd4b2ccfd346..9d5f15ed87fe 100644 --- a/fs/select.c +++ b/fs/select.c @@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; @@ -633,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto out_nofds; alloc_size = 6 * size; - bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); - if (!bits && alloc_size > PAGE_SIZE) - bits = vmalloc(alloc_size); - + bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } @@ -1164,59 +1161,25 @@ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); if (ufdset) { - unsigned long odd; - - if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) - return -EFAULT; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - *fdset++ = h << 32 | l; - nr -= 2; - } - if (odd && __get_user(*fdset, ufdset)) - return -EFAULT; + return compat_get_bitmap(fdset, ufdset, nr); } else { /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that + * kernel fdset at the end, ALIGN makes sure that * actually happens. */ - memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + memset(fdset, 0, ALIGN(nr, BITS_PER_LONG)); + return 0; } - return 0; } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { - unsigned long odd; - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - if (!ufdset) return 0; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - nr -= 2; - } - if (odd && __put_user(*fdset, ufdset)) - return -EFAULT; - return 0; + return compat_put_bitmap(ufdset, fdset, nr); } diff --git a/fs/seq_file.c b/fs/seq_file.c index ca69fb99e41a..dc7c2be963ed 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - void *buf; - gfp_t gfp = GFP_KERNEL; - - /* - * For high order allocations, use __GFP_NORETRY to avoid oom-killing - - * it's better to fall back to vmalloc() than to kill things. For small - * allocations, just use GFP_KERNEL which will oom kill, thus no need - * for vmalloc fallback. - */ - if (size > PAGE_SIZE) - gfp |= __GFP_NORETRY | __GFP_NOWARN; - buf = kmalloc(size, gfp); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..593b022ac11b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -38,12 +38,12 @@ void signalfd_cleanup(struct sighand_struct *sighand) /* * The lockless check can race with remove_wait_queue() in progress, * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. */ if (likely(!waitqueue_active(wqh))) return; - /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ wake_up_poll(wqh, POLLHUP | POLLFREE); } diff --git a/fs/splice.c b/fs/splice.c index 540c4a44756c..ae41201d0325 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -762,7 +762,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, sd.total_len - left); - ret = vfs_iter_write(out, &from, &sd.pos); + ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; diff --git a/fs/stat.c b/fs/stat.c index f494b182c7c7..c35610845ab1 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -672,6 +672,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_bytes -= 512; } } +EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/statfs.c b/fs/statfs.c index 4e4623c7a126..fab9b6a3c116 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -38,6 +38,8 @@ static int flags_by_sb(int s_flags) flags |= ST_SYNCHRONOUS; if (s_flags & MS_MANDLOCK) flags |= ST_MANDLOCK; + if (s_flags & MS_RDONLY) + flags |= ST_RDONLY; return flags; } @@ -244,6 +246,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) #ifdef CONFIG_COMPAT static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) { + struct compat_statfs buf; if (sizeof ubuf->f_blocks == 4) { if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) @@ -257,20 +260,20 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + memset(&buf, 0, sizeof(struct compat_statfs)); + buf.f_type = kbuf->f_type; + buf.f_bsize = kbuf->f_bsize; + buf.f_blocks = kbuf->f_blocks; + buf.f_bfree = kbuf->f_bfree; + buf.f_bavail = kbuf->f_bavail; + buf.f_files = kbuf->f_files; + buf.f_ffree = kbuf->f_ffree; + buf.f_namelen = kbuf->f_namelen; + buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; + buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; + buf.f_frsize = kbuf->f_frsize; + buf.f_flags = kbuf->f_flags; + if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs))) return -EFAULT; return 0; } @@ -299,6 +302,7 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { + struct compat_statfs64 buf; if (sizeof(ubuf->f_bsize) == 4) { if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) @@ -312,20 +316,20 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + memset(&buf, 0, sizeof(struct compat_statfs64)); + buf.f_type = kbuf->f_type; + buf.f_bsize = kbuf->f_bsize; + buf.f_blocks = kbuf->f_blocks; + buf.f_bfree = kbuf->f_bfree; + buf.f_bavail = kbuf->f_bavail; + buf.f_files = kbuf->f_files; + buf.f_ffree = kbuf->f_ffree; + buf.f_namelen = kbuf->f_namelen; + buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; + buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; + buf.f_frsize = kbuf->f_frsize; + buf.f_flags = kbuf->f_flags; + if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs64))) return -EFAULT; return 0; } diff --git a/fs/sync.c b/fs/sync.c index 11ba023434b1..2a54c1f22035 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -192,7 +192,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) spin_unlock(&inode->i_lock); mark_inode_dirty_sync(inode); } - return call_fsync(file, start, end, datasync); + return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 5bdae85ceef7..f5191cb2c947 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -45,7 +45,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/timerfd.c b/fs/timerfd.c index c543cdb5f8ed..ece0c02d7e63 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -169,7 +169,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, - const struct itimerspec *ktmr) + const struct itimerspec64 *ktmr) { enum hrtimer_mode htmode; ktime_t texp; @@ -178,10 +178,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; - texp = timespec_to_ktime(ktmr->it_value); + texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; - ctx->tintv = timespec_to_ktime(ktmr->it_interval); + ctx->tintv = timespec64_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { alarm_init(&ctx->t.alarm, @@ -432,16 +432,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) } static int do_timerfd_settime(int ufd, int flags, - const struct itimerspec *new, - struct itimerspec *old) + const struct itimerspec64 *new, + struct itimerspec64 *old) { struct fd f; struct timerfd_ctx *ctx; int ret; if ((flags & ~TFD_SETTIME_FLAGS) || - !timespec_valid(&new->it_value) || - !timespec_valid(&new->it_interval)) + !itimerspec64_valid(new)) return -EINVAL; ret = timerfd_fget(ufd, &f); @@ -487,8 +486,8 @@ static int do_timerfd_settime(int ufd, int flags, hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); } - old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - old->it_interval = ktime_to_timespec(ctx->tintv); + old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + old->it_interval = ktime_to_timespec64(ctx->tintv); /* * Re-program the timer to the new value ... @@ -500,7 +499,7 @@ static int do_timerfd_settime(int ufd, int flags, return ret; } -static int do_timerfd_gettime(int ufd, struct itimerspec *t) +static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { struct fd f; struct timerfd_ctx *ctx; @@ -525,8 +524,8 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) hrtimer_restart(&ctx->t.tmr); } } - t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - t->it_interval = ktime_to_timespec(ctx->tintv); + t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); fdput(f); return 0; @@ -536,15 +535,15 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct itimerspec __user *, utmr, struct itimerspec __user *, otmr) { - struct itimerspec new, old; + struct itimerspec64 new, old; int ret; - if (copy_from_user(&new, utmr, sizeof(new))) + if (get_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && copy_to_user(otmr, &old, sizeof(old))) + if (otmr && put_itimerspec64(&old, otmr)) return -EFAULT; return ret; @@ -552,11 +551,11 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { - struct itimerspec kotmr; + struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; + return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT @@ -564,15 +563,15 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct compat_itimerspec __user *, utmr, struct compat_itimerspec __user *, otmr) { - struct itimerspec new, old; + struct itimerspec64 new, old; int ret; - if (get_compat_itimerspec(&new, utmr)) + if (get_compat_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && put_compat_itimerspec(otmr, &old)) + if (otmr && put_compat_itimerspec64(&old, otmr)) return -EFAULT; return ret; } @@ -580,10 +579,10 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct compat_itimerspec __user *, otmr) { - struct itimerspec kotmr; + struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; + return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #endif diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 21d36d284735..328e89c2cf83 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = { static int trace_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr trace_files[] = {{""}}; + static const struct tree_descr trace_files[] = {{""}}; struct tracefs_fs_info *fsi; int err; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index b0d0623c83ed..83a961bf7280 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -61,3 +61,16 @@ config UBIFS_FS_ENCRYPTION feature is similar to ecryptfs, but it is more memory efficient since it avoids caching the encrypted and decrypted pages in the page cache. + +config UBIFS_FS_SECURITY + bool "UBIFS Security Labels" + depends on UBIFS_FS + default y + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the ubifs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 718b749fa11a..7cd8a7b95299 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2391,8 +2391,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) ubifs_dump_node(c, sa->node); return -EINVAL; } - if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && - sa->type != UBIFS_XENT_NODE) { + if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE && + sb->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sb->type); ubifs_dump_node(c, sb->node); return -EINVAL; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 8049851cac42..566079d9b402 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -121,7 +121,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = - ubifs_current_time(inode); + current_time(inode); inode->i_mapping->nrpages = 0; switch (mode & S_IFMT) { @@ -766,7 +766,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); - inode->i_ctime = ubifs_current_time(inode); + inode->i_ctime = current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; @@ -841,7 +841,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = ubifs_current_time(dir); + inode->i_ctime = current_time(dir); drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; @@ -945,7 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = ubifs_current_time(dir); + inode->i_ctime = current_time(dir); clear_nlink(inode); drop_nlink(dir); dir->i_size -= sz_change; @@ -1422,7 +1422,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the @i_ctime for inodes on a * rename. */ - time = ubifs_current_time(old_dir); + time = current_time(old_dir); old_inode->i_ctime = time; /* We must adjust parent link count when renaming directories */ @@ -1595,7 +1595,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, lock_4_inodes(old_dir, new_dir, NULL, NULL); - time = ubifs_current_time(old_dir); + time = current_time(old_dir); fst_inode->i_ctime = time; snd_inode->i_ctime = time; old_dir->i_mtime = old_dir->i_ctime = time; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d9ae86f96df7..2cda3d67e2d0 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1196,7 +1196,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1243,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1420,7 +1420,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, */ static int update_mctime(struct inode *inode) { - struct timespec now = ubifs_current_time(inode); + struct timespec now = current_time(inode); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1434,7 +1434,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1511,7 +1511,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; - struct timespec now = ubifs_current_time(inode); + struct timespec now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; @@ -1579,7 +1579,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index da519ba205f6..fdc311246807 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -53,7 +53,7 @@ void ubifs_set_inode_flags(struct inode *inode) * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. * @ioctl_flags: flags to convert * - * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * This function converts ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags * (@UBIFS_COMPR_FL, etc). */ static int ioctl2ubifs(int ioctl_flags) @@ -78,8 +78,8 @@ static int ioctl2ubifs(int ioctl_flags) * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. * @ubifs_flags: flags to convert * - * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags - * (@FS_COMPR_FL, etc). + * This function converts UBIFS inode flags (@UBIFS_COMPR_FL, etc) to ioctl + * flags (@FS_COMPR_FL, etc). */ static int ubifs2ioctl(int ubifs_flags) { @@ -126,7 +126,7 @@ static int setflags(struct inode *inode, int flags) ui->flags = ioctl2ubifs(flags); ubifs_set_inode_flags(inode); - inode->i_ctime = ubifs_current_time(inode); + inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 8ece6ca58c0b..caf83d68fb38 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -225,16 +225,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, } /** - * ubifs_current_time - round current time to time granularity. - * @inode: inode - */ -static inline struct timespec ubifs_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - -/** * ubifs_tnc_lookup - look up a file-system node. * @c: UBIFS file-system description object * @key: node key to lookup diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 586d59347fff..3af4472061cc 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -442,7 +442,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, { int empty_offs, pad_len; - lnum = lnum; dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); ubifs_assert(!(*offs & 7)); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 7f1ead29e727..8c25081a5109 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -84,6 +84,8 @@ static int create_default_filesystem(struct ubifs_info *c) int min_leb_cnt = UBIFS_MIN_LEB_CNT; long long tmp64, main_bytes; __le64 tmp_le64; + __le32 tmp_le32; + struct timespec ts; /* Some functions called from here depend on the @c->key_len filed */ c->key_len = UBIFS_SK_LEN; @@ -298,13 +300,17 @@ static int create_default_filesystem(struct ubifs_info *c) ino->ch.node_type = UBIFS_INO_NODE; ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); ino->nlink = cpu_to_le32(2); - tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + + ktime_get_real_ts(&ts); + ts = timespec_trunc(ts, DEFAULT_TIME_GRAN); + tmp_le64 = cpu_to_le64(ts.tv_sec); ino->atime_sec = tmp_le64; ino->ctime_sec = tmp_le64; ino->mtime_sec = tmp_le64; - ino->atime_nsec = 0; - ino->ctime_nsec = 0; - ino->mtime_nsec = 0; + tmp_le32 = cpu_to_le32(ts.tv_nsec); + ino->atime_nsec = tmp_le32; + ino->ctime_nsec = tmp_le32; + ino->mtime_nsec = tmp_le32; ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4da10a6d702a..298b4d89eee9 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1753,13 +1753,23 @@ int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ubifs_init_security(struct inode *dentry, struct inode *inode, - const struct qstr *qstr); int ubifs_xattr_set(struct inode *host, const char *name, const void *value, size_t size, int flags); ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, size_t size); +#ifdef CONFIG_UBIFS_FS_SECURITY +extern int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr); +#else +static inline int ubifs_init_security(struct inode *dentry, + struct inode *inode, const struct qstr *qstr) +{ + return 0; +} +#endif + + /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index efe00fcb8b75..6c9e62c2ef55 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -152,7 +152,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -234,7 +234,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -488,7 +488,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); @@ -559,6 +559,7 @@ out_free: return err; } +#ifdef CONFIG_UBIFS_FS_SECURITY static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { @@ -599,6 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, } return err; } +#endif static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, @@ -639,15 +641,19 @@ static const struct xattr_handler ubifs_trusted_xattr_handler = { .set = xattr_set, }; +#ifdef CONFIG_UBIFS_FS_SECURITY static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = xattr_get, .set = xattr_set, }; +#endif const struct xattr_handler *ubifs_xattr_handlers[] = { &ubifs_user_xattr_handler, &ubifs_trusted_xattr_handler, +#ifdef CONFIG_UBIFS_FS_SECURITY &ubifs_security_xattr_handler, +#endif NULL }; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a0376a2c1c29..f80be4c5df9d 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -82,7 +82,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); } - + + inode_sub_bytes(inode, count << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -184,6 +185,7 @@ do_more: ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); @@ -398,10 +400,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * There is not enough space for user on the device */ - if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - mutex_unlock(&UFS_SB(sb)->s_lock); - UFSD("EXIT (FAILED)\n"); - return 0; + if (unlikely(ufs_freefrags(uspi) <= uspi->s_root_blocks)) { + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT (FAILED)\n"); + return 0; + } } if (goal >= uspi->s_size) @@ -419,12 +423,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); + *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); - write_sequnlock(&UFS_I(inode)->meta_lock); - *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + write_sequnlock(&UFS_I(inode)->meta_lock); } mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); @@ -437,8 +441,10 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_add_fragments(inode, tmp, oldcount, newcount); if (result) { *err = 0; + read_seqlock_excl(&UFS_I(inode)->meta_lock); UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + read_sequnlock_excl(&UFS_I(inode)->meta_lock); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); @@ -449,39 +455,29 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * allocate new block and move data */ - switch (fs32_to_cpu(sb, usb1->fs_optim)) { - case UFS_OPTSPACE: + if (fs32_to_cpu(sb, usb1->fs_optim) == UFS_OPTSPACE) { request = newcount; - if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree - > uspi->s_dsize * uspi->s_minfree / (2 * 100)) - break; - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - break; - default: - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - - case UFS_OPTTIME: + if (uspi->cs_total.cs_nffree < uspi->s_space_to_time) + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + } else { request = uspi->s_fpb; - if (uspi->cs_total.cs_nffree < uspi->s_dsize * - (uspi->s_minfree - 2) / 100) - break; - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - break; + if (uspi->cs_total.cs_nffree > uspi->s_time_to_space) + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); } result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); + mutex_unlock(&UFS_SB(sb)->s_lock); ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); - write_sequnlock(&UFS_I(inode)->meta_lock); - *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - mutex_unlock(&UFS_SB(sb)->s_lock); + write_sequnlock(&UFS_I(inode)->meta_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -494,6 +490,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return 0; } +static bool try_add_frags(struct inode *inode, unsigned frags) +{ + unsigned size = frags * i_blocksize(inode); + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, size); + if (unlikely((u32)inode->i_blocks != inode->i_blocks)) { + __inode_sub_bytes(inode, size); + spin_unlock(&inode->i_lock); + return false; + } + spin_unlock(&inode->i_lock); + return true; +} + static u64 ufs_add_fragments(struct inode *inode, u64 fragment, unsigned oldcount, unsigned newcount) { @@ -530,6 +540,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; + + if (!try_add_frags(inode, count)) + return 0; /* * Block can be extended */ @@ -647,6 +660,7 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; + inode_sub_bytes(inode, i << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); @@ -657,6 +671,8 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; + if (!try_add_frags(inode, count)) + return 0; for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -716,6 +732,8 @@ norot: return INVBLOCK; ucpi->c_rotor = result; gotit: + if (!try_add_frags(inode, uspi->s_fpb)) + return 0; blkno = ufs_fragstoblks(result); ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index de01b8f2aa78..48609f1d9580 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -53,7 +53,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 9774555b3721..d1dd8cc33179 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -176,6 +176,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; + struct timespec64 ts; unsigned cg, bit, i, j, start; struct ufs_inode_info *ufsi; int err = -ENOSPC; @@ -323,8 +324,9 @@ cg_found: lock_buffer(bh); ufs2_inode = (struct ufs2_inode *)bh->b_data; ufs2_inode += ufs_inotofsbo(inode->i_ino); - ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec); - ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec); + ktime_get_real_ts64(&ts); + ufs2_inode->ui_birthtime = cpu_to_fs64(sb, ts.tv_sec); + ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); if (sb->s_flags & MS_SYNCHRONOUS) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7e41aee7b69a..f36d6a53687d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), - new_size, err, locked_page); + new_size - (lastfrag & uspi->s_fpbmask), err, + locked_page); return tmp != 0; } @@ -284,7 +285,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, locked_page); + goal, nfrags, err, locked_page); if (!tmp) { *err = -ENOSPC; @@ -400,11 +401,20 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff u64 phys64 = 0; unsigned frag = fragment & uspi->s_fpbmask; - if (!create) { - phys64 = ufs_frag_map(inode, offsets, depth); - goto out; - } + phys64 = ufs_frag_map(inode, offsets, depth); + if (!create) + goto done; + if (phys64) { + if (fragment >= UFS_NDIR_FRAGMENT) + goto done; + read_seqlock_excl(&UFS_I(inode)->meta_lock); + if (fragment < UFS_I(inode)->i_lastfrag) { + read_sequnlock_excl(&UFS_I(inode)->meta_lock); + goto done; + } + read_sequnlock_excl(&UFS_I(inode)->meta_lock); + } /* This code entered only while writing ....? */ mutex_lock(&UFS_I(inode)->truncate_mutex); @@ -448,6 +458,11 @@ out: } mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; + +done: + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) @@ -551,10 +566,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); - if (inode->i_nlink == 0) { - ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); - return -1; - } + if (inode->i_nlink == 0) + return -ESTALE; /* * Linux now has 32-bit uid and gid, so we can support EFT. @@ -563,9 +576,9 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); - inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); - inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); - inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); + inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; @@ -599,10 +612,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) */ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); - if (inode->i_nlink == 0) { - ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); - return -1; - } + if (inode->i_nlink == 0) + return -ESTALE; /* * Linux now has 32-bit uid and gid, so we can support EFT. @@ -642,7 +653,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * bh; struct inode *inode; - int err; + int err = -EIO; UFSD("ENTER, ino %lu\n", ino); @@ -677,9 +688,10 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) err = ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + brelse(bh); if (err) goto bad_inode; + inode->i_version++; ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -688,15 +700,13 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) ufs_set_inode_ops(inode); - brelse(bh); - UFSD("EXIT\n"); unlock_new_inode(inode); return inode; bad_inode: iget_failed(inode); - return ERR_PTR(-EIO); + return ERR_PTR(err); } static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) @@ -841,8 +851,11 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { inode->i_size = 0; - if (inode->i_blocks) + if (inode->i_blocks && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) ufs_truncate_blocks(inode); + ufs_update_inode(inode, inode_needs_sync(inode)); } invalidate_inode_buffers(inode); @@ -868,7 +881,6 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count) ctx->to = from + count; } -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) static void ufs_trunc_direct(struct inode *inode) @@ -1100,25 +1112,30 @@ out: return err; } -static void __ufs_truncate_blocks(struct inode *inode) +static void ufs_truncate_blocks(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned offsets[4]; - int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth; int depth2; unsigned i; struct ufs_buffer_head *ubh[3]; void *p; u64 block; - if (!depth) - return; + if (inode->i_size) { + sector_t last = (inode->i_size - 1) >> uspi->s_bshift; + depth = ufs_block_to_path(inode, last, offsets); + if (!depth) + return; + } else { + depth = 1; + } - /* find the last non-zero in offsets[] */ for (depth2 = depth - 1; depth2; depth2--) - if (offsets[depth2]) + if (offsets[depth2] != uspi->s_apb - 1) break; mutex_lock(&ufsi->truncate_mutex); @@ -1127,9 +1144,8 @@ static void __ufs_truncate_blocks(struct inode *inode) offsets[0] = UFS_IND_BLOCK; } else { /* get the blocks that should be partially emptied */ - p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++); for (i = 0; i < depth2; i++) { - offsets[i]++; /* next branch is fully freed */ block = ufs_data_ptr_to_cpu(sb, p); if (!block) break; @@ -1140,7 +1156,7 @@ static void __ufs_truncate_blocks(struct inode *inode) write_sequnlock(&ufsi->meta_lock); break; } - p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]++); } while (i--) free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); @@ -1155,7 +1171,9 @@ static void __ufs_truncate_blocks(struct inode *inode) free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); } } + read_seqlock_excl(&ufsi->meta_lock); ufsi->i_lastfrag = DIRECT_FRAGMENT; + read_sequnlock_excl(&ufsi->meta_lock); mark_inode_dirty(inode); mutex_unlock(&ufsi->truncate_mutex); } @@ -1183,7 +1201,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); - __ufs_truncate_blocks(inode); + ufs_truncate_blocks(inode); inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out: @@ -1191,16 +1209,6 @@ out: return err; } -static void ufs_truncate_blocks(struct inode *inode) -{ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - __ufs_truncate_blocks(inode); -} - int ufs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 131b2b77c818..0a4f58a5073c 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -480,7 +480,7 @@ static void ufs_setup_cstotal(struct super_block *sb) usb3 = ubh_get_usb_third(uspi); if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && - (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) || mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); @@ -596,9 +596,7 @@ static void ufs_put_cstotal(struct super_block *sb) usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); - if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && - (usb1->fs_flags & UFS_FLAGS_UPDATED)) || - mtype == UFS_MOUNT_UFSTYPE_UFS2) { + if (mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); @@ -608,16 +606,26 @@ static void ufs_put_cstotal(struct super_block *sb) cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); - } else { - usb1->fs_cstotal.cs_ndir = - cpu_to_fs32(sb, uspi->cs_total.cs_ndir); - usb1->fs_cstotal.cs_nbfree = - cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); - usb1->fs_cstotal.cs_nifree = - cpu_to_fs32(sb, uspi->cs_total.cs_nifree); - usb1->fs_cstotal.cs_nffree = - cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + goto out; + } + + if (mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) { + /* store stats in both old and new places */ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); } + usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree); +out: ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_print_super_stuff(sb, usb1, usb2, usb3); UFSD("EXIT\n"); @@ -746,6 +754,23 @@ static void ufs_put_super(struct super_block *sb) return; } +static u64 ufs_max_bytes(struct super_block *sb) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int bits = uspi->s_apbshift; + u64 res; + + if (bits > 21) + res = ~0ULL; + else + res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + + (1LL << (3*bits)); + + if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) + return MAX_LFS_FILESIZE; + return res << uspi->s_bshift; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -812,9 +837,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; - /* Keep 2Gig file limit. Some UFS variants need to override - this but as I don't know which I'll let those in the know loosen - the rules */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); @@ -980,6 +1004,13 @@ again: flags |= UFS_ST_SUN; } + if ((flags & UFS_ST_MASK) == UFS_ST_44BSD && + uspi->s_postblformat == UFS_42POSTBLFMT) { + if (!silent) + pr_err("this is not a 44bsd filesystem"); + goto failed; + } + /* * Check ufs magic number */ @@ -1127,8 +1158,8 @@ magic_found: uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { - uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); - uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); } else { uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); @@ -1177,6 +1208,18 @@ magic_found: uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); + uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, + uspi->s_minfree, 100); + if (uspi->s_minfree <= 5) { + uspi->s_time_to_space = ~0ULL; + uspi->s_space_to_time = 0; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); + } else { + uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1; + uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize, + uspi->s_minfree - 2, 100) - 1; + } + /* * Compute another frequently used values */ @@ -1212,6 +1255,7 @@ magic_found: "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } + sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); @@ -1365,19 +1409,17 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) buf->f_type = UFS2_MAGIC; - buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); - } else { + else buf->f_type = UFS_MAGIC; - buf->f_blocks = uspi->s_dsize; - } - buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + - uspi->cs_total.cs_nffree; + + buf->f_blocks = uspi->s_dsize; + buf->f_bfree = ufs_freefrags(uspi); buf->f_ffree = uspi->cs_total.cs_nifree; buf->f_bsize = sb->s_blocksize; - buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) - ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; + buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks) + ? (buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; buf->f_fsid.val[0] = (u32)id; diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 0cbd5d340b67..150eef6f1233 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -733,10 +733,8 @@ struct ufs_sb_private_info { __u32 s_dblkno; /* offset of first data after cg */ __u32 s_cgoffset; /* cylinder group offset in cylinder */ __u32 s_cgmask; /* used to calc mod fs_ntrak */ - __u32 s_size; /* number of blocks (fragments) in fs */ - __u32 s_dsize; /* number of data blocks in fs */ - __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */ - __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */ + __u64 s_size; /* number of blocks (fragments) in fs */ + __u64 s_dsize; /* number of data blocks in fs */ __u32 s_ncg; /* number of cylinder groups */ __u32 s_bsize; /* size of basic blocks */ __u32 s_fsize; /* size of fragments */ @@ -793,6 +791,9 @@ struct ufs_sb_private_info { __u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */ __s32 fs_magic; /* filesystem magic */ unsigned int s_dirblksize; + __u64 s_root_blocks; + __u64 s_time_to_space; + __u64 s_space_to_time; }; /* diff --git a/fs/ufs/util.c b/fs/ufs/util.c index f41ad0a6106f..02497a492eb2 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -243,9 +243,8 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) { - struct page *page; - - page = find_lock_page(mapping, index); + struct inode *inode = mapping->host; + struct page *page = find_lock_page(mapping, index); if (!page) { page = read_mapping_page(mapping, index, NULL); @@ -253,7 +252,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping, printk(KERN_ERR "ufs_change_blocknr: " "read_mapping_page error: ino %lu, index: %lu\n", mapping->host->i_ino, index); - goto out; + return page; } lock_page(page); @@ -262,8 +261,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping, /* Truncate got there first */ unlock_page(page); put_page(page); - page = NULL; - goto out; + return NULL; } if (!PageUptodate(page) || PageError(page)) { @@ -272,11 +270,12 @@ struct page *ufs_get_locked_page(struct address_space *mapping, printk(KERN_ERR "ufs_change_blocknr: " "can not read page: ino %lu, index: %lu\n", - mapping->host->i_ino, index); + inode->i_ino, index); - page = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } } -out: + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); return page; } diff --git a/fs/ufs/util.h b/fs/ufs/util.h index b7fbf53dbc81..9fc7119a1551 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -350,16 +350,11 @@ static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) -/* - * Determine the number of available frags given a - * percentage to hold in reserve. - */ static inline u64 -ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +ufs_freefrags(struct ufs_sb_private_info *uspi) { return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + - uspi->cs_total.cs_nffree - - (uspi->s_dsize * (percentreserved) / 100); + uspi->cs_total.cs_nffree; } /* @@ -473,15 +468,19 @@ static inline unsigned _ubh_find_last_zero_bit_( static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned block) { + u8 mask; switch (uspi->s_fpb) { case 8: return (*ubh_get_addr (ubh, begin + block) == 0xff); case 4: - return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + mask = 0x0f << ((block & 0x01) << 2); + return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; case 2: - return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + mask = 0x03 << ((block & 0x03) << 1); + return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; case 1: - return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + mask = 0x01 << (block & 0x07); + return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; } return 0; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f7555fc25877..cadcd12a3d35 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; - wait_queue_t wq; + wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; @@ -91,7 +91,7 @@ struct userfaultfd_wake_range { unsigned long len; }; -static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; @@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, * wouldn't be enough, the smp_mb__before_spinlock is * enough to avoid an explicit smp_mb() here. */ - list_del_init(&wq->task_list); + list_del_init(&wq->entry); out: return ret; } @@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); if (!pte) goto out; @@ -243,6 +244,7 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -340,9 +342,28 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) bool must_wait, return_to_userland; long blocking_state; - BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - ret = VM_FAULT_SIGBUS; + + /* + * We don't do userfault handling for the final child pid update. + * + * We also don't do userfault handling during + * coredumping. hugetlbfs has the special + * follow_hugetlb_page() to skip missing pages in the + * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with + * the no_page_table() helper in follow_page_mask(), but the + * shmem_vm_ops->fault method is invoked even during + * coredumping without mmap_sem and it ends up here. + */ + if (current->flags & (PF_EXITING|PF_DUMPCORE)) + goto out; + + /* + * Coredumping runs without mmap_sem so we can only check that + * the mmap_sem is held, if PF_DUMPCORE was not set. + */ + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -361,12 +382,6 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * We don't do userfault handling for the final child pid update. - */ - if (current->flags & PF_EXITING) - goto out; - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -435,7 +450,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + vmf->address, vmf->flags, reason); up_read(&mm->mmap_sem); @@ -509,13 +525,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * and it's fine not to block on the spinlock. The uwq on this * kernel stack can be released after the list_del_init. */ - if (!list_empty_careful(&uwq.wq.task_list)) { + if (!list_empty_careful(&uwq.wq.entry)) { spin_lock(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ - list_del(&uwq.wq.task_list); + list_del(&uwq.wq.entry); spin_unlock(&ctx->fault_pending_wqh.lock); } @@ -847,7 +863,7 @@ wakeup: static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; VM_BUG_ON(!spin_is_locked(&wqh->lock)); @@ -856,7 +872,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); + wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; @@ -990,14 +1006,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in - * handle_userfault(). The uwq->wq.task_list + * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. */ - list_del(&uwq->wq.task_list); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1019,7 +1035,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; - list_move(&uwq->wq.task_list, &fork_event); + list_move(&uwq->wq.entry, &fork_event); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1056,8 +1072,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!list_empty(&fork_event)) { uwq = list_first_entry(&fork_event, typeof(*uwq), - wq.task_list); - list_del(&uwq->wq.task_list); + wq.entry); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); userfaultfd_event_complete(ctx, uwq); } @@ -1101,11 +1117,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { - unsigned long start, end; - - start = range->start; - end = range->start + range->len; - spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) @@ -1734,17 +1745,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); - list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } - list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); total++; } diff --git a/fs/xattr.c b/fs/xattr.c index 7e3317cf4045..464c94bf65f9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvmalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; if (copy_from_user(kvalue, value, size)) { error = -EFAULT; goto out; @@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; - kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvzalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; } error = vfs_getxattr(d, kname, kvalue, size); @@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size) if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; - klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); - if (!klist) { - klist = vmalloc(size); - if (!klist) - return -ENOMEM; - } + klist = kvmalloc(size, GFP_KERNEL); + if (!klist) + return -ENOMEM; } error = vfs_listxattr(d, klist, size); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 35faf128f36d..1b98cfa342ab 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -96,3 +96,16 @@ config XFS_DEBUG not useful unless you are debugging a particular problem. Say N unless you are an XFS developer, or you play one on TV. + +config XFS_ASSERT_FATAL + bool "XFS fatal asserts" + default y + depends on XFS_FS && XFS_DEBUG + help + Set the default DEBUG mode ASSERT failure behavior. + + Say Y here to cause DEBUG mode ASSERT failures to result in fatal + errors that BUG() the kernel by default. If you say N, ASSERT failures + result in warnings. + + This behavior can be modified at runtime via sysfs. diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5c90f82b8f6b..a6e955bfead8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,8 +98,7 @@ xfs-y += xfs_aops.o \ xfs_sysfs.o \ xfs_trans.o \ xfs_xattr.o \ - kmem.o \ - uuid.o + kmem.o # low-level transaction/log code xfs-y += xfs_log.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 780fc8986dab..393b6849aeb3 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 33db69be4832..b008ff3250eb 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -111,8 +111,7 @@ xfs_ag_resv_critical( /* Critically low if less than 10% or max btree height remains. */ return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, - pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, - XFS_RANDOM_AG_RESV_CRITICAL); + pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); } /* diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7486401ccbd3..744dcaec34cc 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -606,7 +606,7 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. */ -STATIC int /* error */ +int /* error */ xfs_alloc_read_agfl( xfs_mount_t *mp, /* mount point structure */ xfs_trans_t *tp, /* transaction pointer */ @@ -2454,8 +2454,7 @@ xfs_agf_read_verify( !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) xfs_buf_ioerror(bp, -EFSBADCRC); else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, - XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF)) + XFS_ERRTAG_ALLOC_READ_AGF)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -2842,8 +2841,7 @@ xfs_free_extent( ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT, - XFS_RANDOM_FREE_EXTENT)) + XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, agno, &agbp); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 77d9c27330ab..ef26edc2e938 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -213,6 +213,8 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index e1fcfe7f0a9a..cfde0a0f9706 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -253,7 +253,7 @@ xfs_allocbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } -STATIC __int64_t +STATIC int64_t xfs_bnobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -261,42 +261,42 @@ xfs_bnobt_key_diff( xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -STATIC __int64_t +STATIC int64_t xfs_cntbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - __int64_t diff; + int64_t diff; - diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) return diff; - return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -STATIC __int64_t +STATIC int64_t xfs_bnobt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) - + return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - be32_to_cpu(k2->alloc.ar_startblock); } -STATIC __int64_t +STATIC int64_t xfs_cntbt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - __int64_t diff; + int64_t diff; diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); @@ -395,7 +395,6 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bnobt_keys_inorder( struct xfs_btree_cur *cur, @@ -442,7 +441,6 @@ xfs_cntbt_recs_inorder( be32_to_cpu(r1->alloc.ar_startblock) < be32_to_cpu(r2->alloc.ar_startblock)); } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), @@ -462,10 +460,8 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .key_diff = xfs_bnobt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, -#endif }; static const struct xfs_btree_ops xfs_cntbt_ops = { @@ -486,10 +482,8 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .key_diff = xfs_cntbt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6622d46ddec3..ef8a1c75a467 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -114,6 +114,23 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ +/* Retrieve an extended attribute and its value. Must have iolock. */ +int +xfs_attr_get_ilocked( + struct xfs_inode *ip, + struct xfs_da_args *args) +{ + if (!xfs_inode_hasattr(ip)) + return -ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + return xfs_attr_shortform_getvalue(args); + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + return xfs_attr_leaf_get(args); + else + return xfs_attr_node_get(args); +} + +/* Retrieve an extended attribute by name, and its value. */ int xfs_attr_get( struct xfs_inode *ip, @@ -141,14 +158,7 @@ xfs_attr_get( args.op_flags = XFS_DA_OP_OKNOENT; lock_mode = xfs_ilock_attr_map_shared(ip); - if (!xfs_inode_hasattr(ip)) - error = -ENOATTR; - else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) - error = xfs_attr_shortform_getvalue(&args); - else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) - error = xfs_attr_leaf_get(&args); - else - error = xfs_attr_node_get(&args); + error = xfs_attr_get_ilocked(ip, &args); xfs_iunlock(ip, lock_mode); *valuelenp = args.valuelen; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2852521fc8ec..c6c15e5717e4 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -351,7 +351,7 @@ xfs_attr3_leaf_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d52f525f5b2d..5236d8e45146 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -253,7 +253,7 @@ xfs_attr_rmtval_copyout( xfs_ino_t ino, int *offset, int *valuelen, - __uint8_t **dst) + uint8_t **dst) { char *src = bp->b_addr; xfs_daddr_t bno = bp->b_bn; @@ -301,7 +301,7 @@ xfs_attr_rmtval_copyin( xfs_ino_t ino, int *offset, int *valuelen, - __uint8_t **src) + uint8_t **src) { char *dst = bp->b_addr; xfs_daddr_t bno = bp->b_bn; @@ -355,7 +355,7 @@ xfs_attr_rmtval_get( struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; - __uint8_t *dst = args->value; + uint8_t *dst = args->value; int valuelen; int nmap; int error; @@ -386,7 +386,8 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, args->trans, + mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) @@ -395,7 +396,7 @@ xfs_attr_rmtval_get( error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_buf_relse(bp); + xfs_trans_brelse(args->trans, bp); if (error) return error; @@ -421,7 +422,7 @@ xfs_attr_rmtval_set( struct xfs_bmbt_irec map; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - __uint8_t *src = args->value; + uint8_t *src = args->value; int blkcnt; int valuelen; int nmap; diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 90928bbe693c..afd684ae3136 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -31,10 +31,10 @@ typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; * We generate this then sort it, attr_list() must return things in hash-order. */ typedef struct xfs_attr_sf_sort { - __uint8_t entno; /* entry number in original list */ - __uint8_t namelen; /* length of name value (no null) */ - __uint8_t valuelen; /* length of value */ - __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + uint8_t entno; /* entry number in original list */ + uint8_t namelen; /* length of name value (no null) */ + uint8_t valuelen; /* length of value */ + uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; @@ -42,7 +42,7 @@ typedef struct xfs_attr_sf_sort { #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ - ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1) + ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) #define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) #define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index e1649c0d3e02..61c6b2025d0c 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -25,47 +25,47 @@ /* * masks with n high/low bits set, 64-bit values */ -static inline __uint64_t xfs_mask64hi(int n) +static inline uint64_t xfs_mask64hi(int n) { - return (__uint64_t)-1 << (64 - (n)); + return (uint64_t)-1 << (64 - (n)); } -static inline __uint32_t xfs_mask32lo(int n) +static inline uint32_t xfs_mask32lo(int n) { - return ((__uint32_t)1 << (n)) - 1; + return ((uint32_t)1 << (n)) - 1; } -static inline __uint64_t xfs_mask64lo(int n) +static inline uint64_t xfs_mask64lo(int n) { - return ((__uint64_t)1 << (n)) - 1; + return ((uint64_t)1 << (n)) - 1; } /* Get high bit set out of 32-bit argument, -1 if none set */ -static inline int xfs_highbit32(__uint32_t v) +static inline int xfs_highbit32(uint32_t v) { return fls(v) - 1; } /* Get high bit set out of 64-bit argument, -1 if none set */ -static inline int xfs_highbit64(__uint64_t v) +static inline int xfs_highbit64(uint64_t v) { return fls64(v) - 1; } /* Get low bit set out of 32-bit argument, -1 if none set */ -static inline int xfs_lowbit32(__uint32_t v) +static inline int xfs_lowbit32(uint32_t v) { return ffs(v) - 1; } /* Get low bit set out of 64-bit argument, -1 if none set */ -static inline int xfs_lowbit64(__uint64_t v) +static inline int xfs_lowbit64(uint64_t v) { - __uint32_t w = (__uint32_t)v; + uint32_t w = (uint32_t)v; int n = 0; if (w) { /* lower bits */ n = ffs(w); } else { /* upper bits */ - w = (__uint32_t)(v >> 32); + w = (uint32_t)(v >> 32); if (w) { n = ffs(w); if (n) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f02eb7673392..0a9880777c9c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1280,7 +1280,6 @@ xfs_bmap_read_extents( xfs_bmbt_rec_t *frp; xfs_fsblock_t nextbno; xfs_extnum_t num_recs; - xfs_extnum_t start; num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { @@ -1303,7 +1302,6 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; for (j = 0; j < num_recs; j++, i++, frp++) { xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); @@ -2065,8 +2063,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2123,7 +2123,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); @@ -3993,7 +3992,7 @@ xfs_bmapi_read( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -4474,7 +4473,7 @@ xfs_bmapi_write( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -4695,7 +4694,7 @@ xfs_bmapi_remap( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -5435,6 +5434,7 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ + xfs_fileoff_t max_len; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5456,6 +5456,16 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); + /* + * Guesstimate how many blocks we can unmap without running the risk of + * blowing out the transaction with a mix of EFIs and reflink + * adjustments. + */ + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) + max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); + else + max_len = len; + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5500,7 +5510,7 @@ __xfs_bunmapi( extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && - (nexts == 0 || extno < nexts)) { + (nexts == 0 || extno < nexts) && max_len > 0) { /* * Is the found extent after a hole in which bno lives? * Just back up to the previous extent, if so. @@ -5532,6 +5542,15 @@ __xfs_bunmapi( } if (del.br_startoff + del.br_blockcount > bno + 1) del.br_blockcount = bno + 1 - del.br_startoff; + + /* How much can we safely unmap? */ + if (max_len < del.br_blockcount) { + del.br_startoff += del.br_blockcount - max_len; + if (!wasdel) + del.br_startblock += del.br_blockcount - max_len; + del.br_blockcount = max_len; + } + sum = del.br_startblock + del.br_blockcount; if (isrt && (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { @@ -5708,6 +5727,7 @@ __xfs_bunmapi( if (!isrt && wasdel) xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + max_len -= del.br_blockcount; bno = del.br_startoff - 1; nodelete: /* @@ -6078,7 +6098,7 @@ xfs_bmap_shift_extents( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmap_shift_extents", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -6230,7 +6250,7 @@ xfs_bmap_split_extent_at( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmap_split_extent_at", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -6473,33 +6493,33 @@ xfs_bmap_finish_one( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { - int error = 0, done; + xfs_fsblock_t firstfsb; + int error = 0; trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, blockcount, state); + ip->i_ino, whichfork, startoff, *blockcount, state); if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE)) + XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (type) { case XFS_BMAP_MAP: - error = xfs_bmapi_remap(tp, ip, startoff, blockcount, + error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, startblock, dfops); + *blockcount = 0; break; case XFS_BMAP_UNMAP: - error = xfs_bunmapi(tp, ip, startoff, blockcount, - XFS_BMAPI_REMAP, 1, &startblock, dfops, &done); - ASSERT(done); + error = __xfs_bunmapi(tp, ip, startoff, blockcount, + XFS_BMAPI_REMAP, 1, &firstfsb, dfops); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index c35a14fa1527..851982a5dfbc 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -271,7 +271,7 @@ struct xfs_bmap_intent { int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6cba69aff077..85de22513014 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -94,8 +94,8 @@ xfs_bmdr_to_bmbt( */ STATIC void __xfs_bmbt_get_all( - __uint64_t l0, - __uint64_t l1, + uint64_t l0, + uint64_t l1, xfs_bmbt_irec_t *s) { int ext_flag; @@ -573,6 +573,16 @@ xfs_bmbt_init_key_from_rec( } STATIC void +xfs_bmbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = cpu_to_be64( + xfs_bmbt_disk_get_startoff(&rec->bmbt) + + xfs_bmbt_disk_get_blockcount(&rec->bmbt) - 1); +} + +STATIC void xfs_bmbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -588,15 +598,25 @@ xfs_bmbt_init_ptr_from_cur( ptr->l = 0; } -STATIC __int64_t +STATIC int64_t xfs_bmbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { - return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - cur->bc_rec.b.br_startoff; } +STATIC int64_t +xfs_bmbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - + be64_to_cpu(k2->bmbt.br_startoff); +} + static bool xfs_bmbt_verify( struct xfs_buf *bp) @@ -687,7 +707,6 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bmbt_keys_inorder( struct xfs_btree_cur *cur, @@ -708,7 +727,6 @@ xfs_bmbt_recs_inorder( xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= xfs_bmbt_disk_get_startoff(&r2->bmbt); } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), @@ -722,14 +740,14 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .get_minrecs = xfs_bmbt_get_minrecs, .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .diff_two_keys = xfs_bmbt_diff_two_keys, .buf_ops = &xfs_bmbt_buf_ops, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5392674bf893..4da85fff69ad 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -43,7 +43,7 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { +static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, XFS_FIBT_MAGIC, 0 }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, @@ -51,12 +51,12 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { XFS_REFC_CRC_MAGIC } }; -__uint32_t +uint32_t xfs_btree_magic( int crc, xfs_btnum_t btnum) { - __uint32_t magic = xfs_magics[crc][btnum]; + uint32_t magic = xfs_magics[crc][btnum]; /* Ensure we asked for crc for crc-only magics. */ ASSERT(magic != 0); @@ -101,8 +101,7 @@ xfs_btree_check_lblock( be64_to_cpu(block->bb_u.l.bb_rightsib))); if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, - XFS_ERRTAG_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); @@ -153,8 +152,7 @@ xfs_btree_check_sblock( block->bb_u.s.bb_rightsib; if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, - XFS_ERRTAG_BTREE_CHECK_SBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); @@ -568,7 +566,7 @@ xfs_btree_ptr_offset( /* * Return a pointer to the n-th record in the btree block. */ -STATIC union xfs_btree_rec * +union xfs_btree_rec * xfs_btree_rec_addr( struct xfs_btree_cur *cur, int n, @@ -581,7 +579,7 @@ xfs_btree_rec_addr( /* * Return a pointer to the n-th key in the btree block. */ -STATIC union xfs_btree_key * +union xfs_btree_key * xfs_btree_key_addr( struct xfs_btree_cur *cur, int n, @@ -594,7 +592,7 @@ xfs_btree_key_addr( /* * Return a pointer to the n-th high key in the btree block. */ -STATIC union xfs_btree_key * +union xfs_btree_key * xfs_btree_high_key_addr( struct xfs_btree_cur *cur, int n, @@ -607,7 +605,7 @@ xfs_btree_high_key_addr( /* * Return a pointer to the n-th block pointer in the btree block. */ -STATIC union xfs_btree_ptr * +union xfs_btree_ptr * xfs_btree_ptr_addr( struct xfs_btree_cur *cur, int n, @@ -641,7 +639,7 @@ xfs_btree_get_iroot( * Retrieve the block pointer from the cursor at the given level. * This may be an inode btree root or from a buffer. */ -STATIC struct xfs_btree_block * /* generic btree block pointer */ +struct xfs_btree_block * /* generic btree block pointer */ xfs_btree_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in btree */ @@ -778,14 +776,14 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - __int64_t fields, /* bitmask of fields */ + int64_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - __int64_t imask; /* mask for current bit number */ + int64_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* @@ -1756,7 +1754,7 @@ error0: return error; } -STATIC int +int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in the btree */ @@ -1846,7 +1844,7 @@ xfs_btree_lookup( int *stat) /* success/failure */ { struct xfs_btree_block *block; /* current btree block */ - __int64_t diff; /* difference for the current key */ + int64_t diff; /* difference for the current key */ int error; /* error return value */ int keyno; /* current key number */ int level; /* level in the btree */ @@ -4395,7 +4393,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ @@ -4435,7 +4433,7 @@ xfs_btree_visit_blocks( * recovery completion writes the changes to disk. */ struct xfs_btree_block_change_owner_info { - __uint64_t new_owner; + uint64_t new_owner; struct list_head *buffer_list; }; @@ -4481,7 +4479,7 @@ xfs_btree_block_change_owner( int xfs_btree_change_owner( struct xfs_btree_cur *cur, - __uint64_t new_owner, + uint64_t new_owner, struct list_head *buffer_list) { struct xfs_btree_block_change_owner_info bbcoi; @@ -4585,7 +4583,7 @@ xfs_btree_simple_query_range( { union xfs_btree_rec *recp; union xfs_btree_key rec_key; - __int64_t diff; + int64_t diff; int stat; bool firstrec = true; int error; @@ -4682,8 +4680,8 @@ xfs_btree_overlapped_query_range( union xfs_btree_key *hkp; union xfs_btree_rec *recp; struct xfs_btree_block *block; - __int64_t ldiff; - __int64_t hdiff; + int64_t ldiff; + int64_t hdiff; int level; struct xfs_buf *bp; int i; @@ -4849,12 +4847,14 @@ xfs_btree_query_all( xfs_btree_query_range_fn fn, void *priv) { - union xfs_btree_irec low_rec; - union xfs_btree_irec high_rec; + union xfs_btree_key low_key; + union xfs_btree_key high_key; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + memset(&low_key, 0, sizeof(low_key)); + memset(&high_key, 0xFF, sizeof(high_key)); - memset(&low_rec, 0, sizeof(low_rec)); - memset(&high_rec, 0xFF, sizeof(high_rec)); - return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv); + return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 27bed08261c5..9c95e965cfe5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -76,7 +76,7 @@ union xfs_btree_rec { #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) #define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) -__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); +uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. @@ -150,20 +150,19 @@ struct xfs_btree_ops { union xfs_btree_rec *rec); /* difference between key value and cursor value */ - __int64_t (*key_diff)(struct xfs_btree_cur *cur, + int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. */ - __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, + int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, union xfs_btree_key *key1, union xfs_btree_key *key2); const struct xfs_buf_ops *buf_ops; -#if defined(DEBUG) || defined(XFS_WARN) /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, @@ -173,7 +172,6 @@ struct xfs_btree_ops { int (*recs_inorder)(struct xfs_btree_cur *cur, union xfs_btree_rec *r1, union xfs_btree_rec *r2); -#endif }; /* @@ -213,11 +211,11 @@ typedef struct xfs_btree_cur union xfs_btree_irec bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ - __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ + uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ #define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ #define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ - __uint8_t bc_nlevels; /* number of levels in the tree */ - __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + uint8_t bc_nlevels; /* number of levels in the tree */ + uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ union { @@ -330,7 +328,7 @@ xfs_btree_islastblock( */ void xfs_btree_offsets( - __int64_t fields, /* bitmask of fields */ + int64_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -408,7 +406,7 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); -int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, +int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, struct list_head *buffer_list); /* @@ -434,7 +432,7 @@ static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) } static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, - __uint16_t numrecs) + uint16_t numrecs) { block->bb_numrecs = cpu_to_be16(numrecs); } @@ -506,4 +504,17 @@ int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, + struct xfs_btree_block *block); +int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, + union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); +struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, + int level, struct xfs_buf **bpp); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index a416c7cb23ea..8211f48b98e6 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -1,7 +1,7 @@ #ifndef _XFS_CKSUM_H #define _XFS_CKSUM_H 1 -#define XFS_CRC_SEED (~(__uint32_t)0) +#define XFS_CRC_SEED (~(uint32_t)0) /* * Calculate the intermediate checksum for a buffer that has the CRC field @@ -9,11 +9,11 @@ * cksum_offset parameter. We do not modify the buffer during verification, * hence we have to split the CRC calculation across the cksum_offset. */ -static inline __uint32_t +static inline uint32_t xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t zero = 0; - __uint32_t crc; + uint32_t zero = 0; + uint32_t crc; /* Calculate CRC up to the checksum. */ crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); @@ -30,7 +30,7 @@ xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) * Fast CRC method where the buffer is modified. Callers must have exclusive * access to the buffer while the calculation takes place. */ -static inline __uint32_t +static inline uint32_t xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) { /* zero the CRC field */ @@ -48,7 +48,7 @@ xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) * so that it is consistent on disk. */ static inline __le32 -xfs_end_cksum(__uint32_t crc) +xfs_end_cksum(uint32_t crc) { return ~cpu_to_le32(crc); } @@ -62,7 +62,7 @@ xfs_end_cksum(__uint32_t crc) static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); + uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -73,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); + uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 1bdf2888295b..6d4335815c3f 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -263,7 +263,7 @@ xfs_da3_node_read( err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, which_fork, &xfs_da3_node_buf_ops); - if (!err && tp) { + if (!err && tp && *bpp) { struct xfs_da_blkinfo *info = (*bpp)->b_addr; int type; @@ -1282,7 +1282,7 @@ xfs_da3_fixhashpath( return; break; case XFS_DIR2_LEAFN_MAGIC: - lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); + lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count); if (count == 0) return; break; @@ -1502,8 +1502,8 @@ xfs_da3_node_lookup_int( if (blk->magic == XFS_DIR2_LEAFN_MAGIC || blk->magic == XFS_DIR3_LEAFN_MAGIC) { blk->magic = XFS_DIR2_LEAFN_MAGIC; - blk->hashval = xfs_dir2_leafn_lasthash(args->dp, - blk->bp, NULL); + blk->hashval = xfs_dir2_leaf_lasthash(args->dp, + blk->bp, NULL); break; } @@ -1929,8 +1929,8 @@ xfs_da3_path_shift( blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - blk->hashval = xfs_dir2_leafn_lasthash(args->dp, - blk->bp, NULL); + blk->hashval = xfs_dir2_leaf_lasthash(args->dp, + blk->bp, NULL); break; default: ASSERT(0); @@ -1952,7 +1952,7 @@ xfs_da3_path_shift( * This is implemented with some source-level loop unrolling. */ xfs_dahash_t -xfs_da_hashname(const __uint8_t *name, int namelen) +xfs_da_hashname(const uint8_t *name, int namelen) { xfs_dahash_t hash; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 4e29cb6a3627..ae6de17467f2 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -60,10 +60,10 @@ enum xfs_dacmp { */ typedef struct xfs_da_args { struct xfs_da_geometry *geo; /* da block geometry */ - const __uint8_t *name; /* string (maybe not NULL terminated) */ + const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ - __uint8_t filetype; /* filetype of inode for directories */ - __uint8_t *value; /* set of bytes (maybe contain NULLs) */ + uint8_t filetype; /* filetype of inode for directories */ + uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ xfs_dahash_t hashval; /* hash value of name */ @@ -207,7 +207,7 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); -uint xfs_da_hashname(const __uint8_t *name_string, int name_length); +uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index f1e8d4dbb600..6d77d1a8498a 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -49,7 +49,7 @@ xfs_dir3_sf_entsize( struct xfs_dir2_sf_hdr *hdr, int len) { - return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); + return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t); } static struct xfs_dir2_sf_entry * @@ -77,7 +77,7 @@ xfs_dir3_sf_nextentry( * not necessary. For non-filetype enable directories, the type is always * unknown and we never store the value. */ -static __uint8_t +static uint8_t xfs_dir2_sfe_get_ftype( struct xfs_dir2_sf_entry *sfep) { @@ -87,16 +87,16 @@ xfs_dir2_sfe_get_ftype( static void xfs_dir2_sfe_put_ftype( struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype) + uint8_t ftype) { ASSERT(ftype < XFS_DIR3_FT_MAX); } -static __uint8_t +static uint8_t xfs_dir3_sfe_get_ftype( struct xfs_dir2_sf_entry *sfep) { - __uint8_t ftype; + uint8_t ftype; ftype = sfep->name[sfep->namelen]; if (ftype >= XFS_DIR3_FT_MAX) @@ -107,7 +107,7 @@ xfs_dir3_sfe_get_ftype( static void xfs_dir3_sfe_put_ftype( struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype) + uint8_t ftype) { ASSERT(ftype < XFS_DIR3_FT_MAX); @@ -124,7 +124,7 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - __uint8_t *from) + uint8_t *from) { if (hdr->i8count) return get_unaligned_be64(from) & 0x00ffffffffffffffULL; @@ -135,7 +135,7 @@ xfs_dir2_sf_get_ino( static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - __uint8_t *to, + uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); @@ -225,7 +225,7 @@ xfs_dir3_sfe_put_ino( #define XFS_DIR3_DATA_ENTSIZE(n) \ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ - sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \ XFS_DIR2_DATA_ALIGN) static int @@ -242,7 +242,7 @@ xfs_dir3_data_entsize( return XFS_DIR3_DATA_ENTSIZE(n); } -static __uint8_t +static uint8_t xfs_dir2_data_get_ftype( struct xfs_dir2_data_entry *dep) { @@ -252,16 +252,16 @@ xfs_dir2_data_get_ftype( static void xfs_dir2_data_put_ftype( struct xfs_dir2_data_entry *dep, - __uint8_t ftype) + uint8_t ftype) { ASSERT(ftype < XFS_DIR3_FT_MAX); } -static __uint8_t +static uint8_t xfs_dir3_data_get_ftype( struct xfs_dir2_data_entry *dep) { - __uint8_t ftype = dep->name[dep->namelen]; + uint8_t ftype = dep->name[dep->namelen]; if (ftype >= XFS_DIR3_FT_MAX) return XFS_DIR3_FT_UNKNOWN; @@ -271,7 +271,7 @@ xfs_dir3_data_get_ftype( static void xfs_dir3_data_put_ftype( struct xfs_dir2_data_entry *dep, - __uint8_t type) + uint8_t type) { ASSERT(type < XFS_DIR3_FT_MAX); ASSERT(dep->namelen != 0); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 9a492a9e19bd..3771edcb301d 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -111,11 +111,11 @@ struct xfs_da3_intnode { * appropriate. */ struct xfs_da3_icnode_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t level; + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t level; }; /* @@ -187,14 +187,14 @@ struct xfs_da3_icnode_hdr { /* * Byte offset in data block and shortform entry. */ -typedef __uint16_t xfs_dir2_data_off_t; +typedef uint16_t xfs_dir2_data_off_t; #define NULLDATAOFF 0xffffU typedef uint xfs_dir2_data_aoff_t; /* argument form */ /* * Offset in data space of a data entry. */ -typedef __uint32_t xfs_dir2_dataptr_t; +typedef uint32_t xfs_dir2_dataptr_t; #define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) #define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) @@ -206,7 +206,7 @@ typedef xfs_off_t xfs_dir2_off_t; /* * Directory block number (logical dirblk in file) */ -typedef __uint32_t xfs_dir2_db_t; +typedef uint32_t xfs_dir2_db_t; #define XFS_INO32_SIZE 4 #define XFS_INO64_SIZE 8 @@ -226,9 +226,9 @@ typedef __uint32_t xfs_dir2_db_t; * over them. */ typedef struct xfs_dir2_sf_hdr { - __uint8_t count; /* count of entries */ - __uint8_t i8count; /* count of 8-byte inode #s */ - __uint8_t parent[8]; /* parent dir inode number */ + uint8_t count; /* count of entries */ + uint8_t i8count; /* count of 8-byte inode #s */ + uint8_t parent[8]; /* parent dir inode number */ } __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { @@ -447,11 +447,11 @@ struct xfs_dir3_leaf_hdr { }; struct xfs_dir3_icleaf_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t stale; + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t stale; }; /* @@ -538,10 +538,10 @@ struct xfs_dir3_free { * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. */ struct xfs_dir3_icfree_hdr { - __uint32_t magic; - __uint32_t firstdb; - __uint32_t nvalid; - __uint32_t nused; + uint32_t magic; + uint32_t firstdb; + uint32_t nvalid; + uint32_t nused; }; @@ -632,10 +632,10 @@ typedef struct xfs_attr_shortform { __u8 padding; } hdr; struct xfs_attr_sf_entry { - __uint8_t namelen; /* actual length of name (no NULL) */ - __uint8_t valuelen; /* actual length of value (no NULL) */ - __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - __uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t namelen; /* actual length of name (no NULL) */ + uint8_t valuelen; /* actual length of value (no NULL) */ + uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + uint8_t nameval[1]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ } xfs_attr_shortform_t; @@ -725,22 +725,22 @@ struct xfs_attr3_leafblock { * incore, neutral version of the attribute leaf header */ struct xfs_attr3_icleaf_hdr { - __uint32_t forw; - __uint32_t back; - __uint16_t magic; - __uint16_t count; - __uint16_t usedbytes; + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t usedbytes; /* * firstused is 32-bit here instead of 16-bit like the on-disk variant * to support maximum fsb size of 64k without overflow issues throughout * the attr code. Instead, the overflow condition is handled on * conversion to/from disk. */ - __uint32_t firstused; + uint32_t firstused; __u8 holes; struct { - __uint16_t base; - __uint16_t size; + uint16_t base; + uint16_t size; } freemap[XFS_ATTR_LEAF_MAPSIZE]; }; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 2f389d366e93..ccf9783fd3f0 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -218,8 +218,7 @@ xfs_dir_ino_validate( agblkno != 0 && ioff < (1 << mp->m_sb.sb_inopblog) && XFS_AGINO_TO_INO(mp, agno, agino) == ino; - if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, - XFS_RANDOM_DIR_INO_VALIDATE))) { + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index d6e6d9d16f6c..21c8f8bf94d5 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -47,9 +47,9 @@ struct xfs_dir_ops { struct xfs_dir2_sf_entry * (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep); - __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); + uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, - __uint8_t ftype); + uint8_t ftype); xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep); void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, @@ -60,9 +60,9 @@ struct xfs_dir_ops { xfs_ino_t ino); int (*data_entsize)(int len); - __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); + uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, - __uint8_t ftype); + uint8_t ftype); __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); struct xfs_dir2_data_free * (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index aa17cb788946..43c902f7a68d 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -139,7 +139,7 @@ xfs_dir3_block_read( err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index b887fb2a2bcf..27297a689d9c 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -145,7 +145,7 @@ xfs_dir3_leaf_check_int( static bool xfs_dir3_leaf_verify( struct xfs_buf *bp, - __uint16_t magic) + uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; @@ -154,7 +154,7 @@ xfs_dir3_leaf_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - __uint16_t magic3; + uint16_t magic3; magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC : XFS_DIR3_LEAFN_MAGIC; @@ -178,7 +178,7 @@ xfs_dir3_leaf_verify( static void __read_verify( struct xfs_buf *bp, - __uint16_t magic) + uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -195,7 +195,7 @@ __read_verify( static void __write_verify( struct xfs_buf *bp, - __uint16_t magic) + uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; @@ -256,7 +256,7 @@ const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .verify_write = xfs_dir3_leafn_write_verify, }; -static int +int xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, @@ -268,7 +268,7 @@ xfs_dir3_leaf_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); return err; } @@ -285,7 +285,7 @@ xfs_dir3_leafn_read( err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); return err; } @@ -299,7 +299,7 @@ xfs_dir3_leaf_init( struct xfs_trans *tp, struct xfs_buf *bp, xfs_ino_t owner, - __uint16_t type) + uint16_t type) { struct xfs_dir2_leaf *leaf = bp->b_addr; @@ -343,7 +343,7 @@ xfs_dir3_leaf_get_buf( xfs_da_args_t *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, - __uint16_t magic) + uint16_t magic) { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index bbd1238852b3..682e2bf370c7 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -528,7 +528,7 @@ xfs_dir2_free_hdr_check( * Stale entries are ok. */ xfs_dahash_t /* hash value */ -xfs_dir2_leafn_lasthash( +xfs_dir2_leaf_lasthash( struct xfs_inode *dp, struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ @@ -540,7 +540,9 @@ xfs_dir2_leafn_lasthash( dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || - leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); if (count) *count = leafhdr.count; @@ -1405,8 +1407,8 @@ xfs_dir2_leafn_split( /* * Update last hashval in each block since we added the name. */ - oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); - newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + oldblk->hashval = xfs_dir2_leaf_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leaf_lasthash(dp, newblk->bp, NULL); xfs_dir3_leaf_check(dp, oldblk->bp); xfs_dir3_leaf_check(dp, newblk->bp); return error; diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 39f8604f764e..4badd26c47e6 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -58,6 +58,8 @@ extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); /* xfs_dir2_leaf.c */ +extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, @@ -69,7 +71,7 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_buf **bpp, __uint16_t magic); + struct xfs_buf **bpp, uint16_t magic); extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, struct xfs_buf *bp, int first, int last); extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, @@ -93,7 +95,7 @@ extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, +extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp, struct xfs_buf *bp, int *count); extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, @@ -128,7 +130,7 @@ extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern int xfs_dir2_sf_verify(struct xfs_inode *ip); /* xfs_dir2_readdir.c */ -extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, - size_t bufsize); +extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, + struct dir_context *ctx, size_t bufsize); #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e84af093b2ab..be8b9755f66a 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -647,7 +647,7 @@ xfs_dir2_sf_verify( int offset; int size; int error; - __uint8_t filetype; + uint8_t filetype; ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index a1dccd8d96bc..23229f0c5b15 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -103,8 +103,8 @@ struct xfs_ifork; * Must be padded to 64 bit alignment. */ typedef struct xfs_sb { - __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ - __uint32_t sb_blocksize; /* logical block size, bytes */ + uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + uint32_t sb_blocksize; /* logical block size, bytes */ xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ xfs_rtblock_t sb_rextents; /* number of realtime extents */ @@ -118,45 +118,45 @@ typedef struct xfs_sb { xfs_agnumber_t sb_agcount; /* number of allocation groups */ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ xfs_extlen_t sb_logblocks; /* number of log blocks */ - __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ - __uint16_t sb_sectsize; /* volume sector size, bytes */ - __uint16_t sb_inodesize; /* inode size, bytes */ - __uint16_t sb_inopblock; /* inodes per block */ + uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + uint16_t sb_sectsize; /* volume sector size, bytes */ + uint16_t sb_inodesize; /* inode size, bytes */ + uint16_t sb_inopblock; /* inodes per block */ char sb_fname[12]; /* file system name */ - __uint8_t sb_blocklog; /* log2 of sb_blocksize */ - __uint8_t sb_sectlog; /* log2 of sb_sectsize */ - __uint8_t sb_inodelog; /* log2 of sb_inodesize */ - __uint8_t sb_inopblog; /* log2 of sb_inopblock */ - __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ - __uint8_t sb_rextslog; /* log2 of sb_rextents */ - __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ - __uint8_t sb_imax_pct; /* max % of fs for inode space */ + uint8_t sb_blocklog; /* log2 of sb_blocksize */ + uint8_t sb_sectlog; /* log2 of sb_sectsize */ + uint8_t sb_inodelog; /* log2 of sb_inodesize */ + uint8_t sb_inopblog; /* log2 of sb_inopblock */ + uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + uint8_t sb_rextslog; /* log2 of sb_rextents */ + uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + uint8_t sb_imax_pct; /* max % of fs for inode space */ /* statistics */ /* * These fields must remain contiguous. If you really * want to change their layout, make sure you fix the * code in xfs_trans_apply_sb_deltas(). */ - __uint64_t sb_icount; /* allocated inodes */ - __uint64_t sb_ifree; /* free inodes */ - __uint64_t sb_fdblocks; /* free data blocks */ - __uint64_t sb_frextents; /* free realtime extents */ + uint64_t sb_icount; /* allocated inodes */ + uint64_t sb_ifree; /* free inodes */ + uint64_t sb_fdblocks; /* free data blocks */ + uint64_t sb_frextents; /* free realtime extents */ /* * End contiguous fields. */ xfs_ino_t sb_uquotino; /* user quota inode */ xfs_ino_t sb_gquotino; /* group quota inode */ - __uint16_t sb_qflags; /* quota flags */ - __uint8_t sb_flags; /* misc. flags */ - __uint8_t sb_shared_vn; /* shared version number */ + uint16_t sb_qflags; /* quota flags */ + uint8_t sb_flags; /* misc. flags */ + uint8_t sb_shared_vn; /* shared version number */ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ - __uint32_t sb_unit; /* stripe or raid unit */ - __uint32_t sb_width; /* stripe or raid width */ - __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ - __uint8_t sb_logsectlog; /* log2 of the log sector size */ - __uint16_t sb_logsectsize; /* sector size for the log, bytes */ - __uint32_t sb_logsunit; /* stripe unit size for the log */ - __uint32_t sb_features2; /* additional feature bits */ + uint32_t sb_unit; /* stripe or raid unit */ + uint32_t sb_width; /* stripe or raid width */ + uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + uint8_t sb_logsectlog; /* log2 of the log sector size */ + uint16_t sb_logsectsize; /* sector size for the log, bytes */ + uint32_t sb_logsunit; /* stripe unit size for the log */ + uint32_t sb_features2; /* additional feature bits */ /* * bad features2 field as a result of failing to pad the sb structure to @@ -167,17 +167,17 @@ typedef struct xfs_sb { * the value in sb_features2 when formatting the incore superblock to * the disk buffer. */ - __uint32_t sb_bad_features2; + uint32_t sb_bad_features2; /* version 5 superblock fields start here */ /* feature masks */ - __uint32_t sb_features_compat; - __uint32_t sb_features_ro_compat; - __uint32_t sb_features_incompat; - __uint32_t sb_features_log_incompat; + uint32_t sb_features_compat; + uint32_t sb_features_ro_compat; + uint32_t sb_features_incompat; + uint32_t sb_features_log_incompat; - __uint32_t sb_crc; /* superblock crc */ + uint32_t sb_crc; /* superblock crc */ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ @@ -449,7 +449,7 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) static inline bool xfs_sb_has_compat_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } @@ -465,7 +465,7 @@ xfs_sb_has_compat_feature( static inline bool xfs_sb_has_ro_compat_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } @@ -482,7 +482,7 @@ xfs_sb_has_ro_compat_feature( static inline bool xfs_sb_has_incompat_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } @@ -492,7 +492,7 @@ xfs_sb_has_incompat_feature( static inline bool xfs_sb_has_incompat_log_feature( struct xfs_sb *sbp, - __uint32_t feature) + uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } @@ -594,8 +594,8 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) */ #define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) #define XFS_B_TO_FSB(mp,b) \ - ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) + ((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) /* @@ -1072,7 +1072,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * next agno_log bits - ag number * high agno_log-agblklog-inopblog bits - 0 */ -#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1) #define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog #define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog #define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log @@ -1211,6 +1211,7 @@ struct xfs_dsymlink_hdr { #define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) +#define XFS_SYMLINK_MAXLEN 1024 /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 3 extents back from @@ -1269,16 +1270,16 @@ typedef __be32 xfs_alloc_ptr_t; #define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ #define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ -typedef __uint64_t xfs_inofree_t; +typedef uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) #define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ -#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(uint16_t)) #define XFS_INODES_PER_HOLEMASK_BIT \ - (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(uint16_t))) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { @@ -1312,9 +1313,9 @@ typedef struct xfs_inobt_rec { typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __uint16_t ir_holemask; /* hole mask for sparse chunks */ - __uint8_t ir_count; /* total inode count */ - __uint8_t ir_freecount; /* count of free inodes (set bits) */ + uint16_t ir_holemask; /* hole mask for sparse chunks */ + uint8_t ir_count; /* total inode count */ + uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; @@ -1397,15 +1398,15 @@ struct xfs_rmap_rec { * rm_offset:54-60 aren't used and should be zero * rm_offset:0-53 is the block offset within the inode */ -#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63) -#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62) -#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61) +#define XFS_RMAP_OFF_ATTR_FORK ((uint64_t)1ULL << 63) +#define XFS_RMAP_OFF_BMBT_BLOCK ((uint64_t)1ULL << 62) +#define XFS_RMAP_OFF_UNWRITTEN ((uint64_t)1ULL << 61) -#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U) +#define XFS_RMAP_LEN_MAX ((uint32_t)~0U) #define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \ XFS_RMAP_OFF_BMBT_BLOCK | \ XFS_RMAP_OFF_UNWRITTEN) -#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL) +#define XFS_RMAP_OFF_MASK ((uint64_t)0x3FFFFFFFFFFFFFULL) #define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK) @@ -1431,8 +1432,8 @@ struct xfs_rmap_rec { struct xfs_rmap_irec { xfs_agblock_t rm_startblock; /* extent start block */ xfs_extlen_t rm_blockcount; /* extent length */ - __uint64_t rm_owner; /* extent owner */ - __uint64_t rm_offset; /* offset within the owner */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ unsigned int rm_flags; /* state flags */ }; @@ -1544,11 +1545,11 @@ typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; -typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; typedef struct xfs_bmbt_rec_host { - __uint64_t l0, l1; + uint64_t l0, l1; } xfs_bmbt_rec_host_t; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 095bdf049a3f..8c61f21535d4 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -302,10 +302,10 @@ typedef struct xfs_bstat { * and using two 16bit values to hold new 32bit projid was choosen * to retain compatibility with "old" filesystems). */ -static inline __uint32_t +static inline uint32_t bstat_get_projid(struct xfs_bstat *bs) { - return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; + return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; } /* @@ -446,19 +446,15 @@ typedef struct xfs_handle { } xfs_handle_t; #define ha_fsid ha_u._ha_fsid -#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ - - (char *) &(handle)) \ - + (handle).ha_fid.fid_len) - /* * Structure passed to XFS_IOC_SWAPEXT */ typedef struct xfs_swapext { - __int64_t sx_version; /* version */ + int64_t sx_version; /* version */ #define XFS_SX_VERSION 0 - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ + int64_t sx_fdtarget; /* fd of target file */ + int64_t sx_fdtmp; /* fd of tmp file */ xfs_off_t sx_offset; /* offset into file */ xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ @@ -546,7 +542,7 @@ typedef struct xfs_swapext #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) -#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d41ade5d293e..ffd5a15d1bb6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -46,7 +46,7 @@ /* * Allocation group level functions. */ -static inline int +int xfs_ialloc_cluster_alignment( struct xfs_mount *mp) { @@ -98,24 +98,15 @@ xfs_inobt_update( return xfs_btree_update(cur, &rec); } -/* - * Get the data from the pointed-to record. - */ -int /* error */ -xfs_inobt_get_rec( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_inobt_rec_incore_t *irec, /* btree record */ - int *stat) /* output: success/failure */ +/* Convert on-disk btree record to incore inobt record. */ +void +xfs_inobt_btrec_to_irec( + struct xfs_mount *mp, + union xfs_btree_rec *rec, + struct xfs_inobt_rec_incore *irec) { - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || *stat == 0) - return error; - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; @@ -130,6 +121,25 @@ xfs_inobt_get_rec( be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } irec->ir_free = be64_to_cpu(rec->inobt.ir_free); +} + +/* + * Get the data from the pointed-to record. + */ +int +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || *stat == 0) + return error; + + xfs_inobt_btrec_to_irec(cur->bc_mp, rec, irec); return 0; } @@ -140,9 +150,9 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, - __uint16_t holemask, - __uint8_t count, - __int32_t freecount, + uint16_t holemask, + uint8_t count, + int32_t freecount, xfs_inofree_t free, int *stat) { @@ -2542,8 +2552,7 @@ xfs_agi_read_verify( !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_buf_ioerror(bp, -EFSBADCRC); else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, - XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI)) + XFS_ERRTAG_IALLOC_READ_AGI)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 0bb89669fc07..b32cfb5aeb5b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -168,5 +168,10 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +union xfs_btree_rec; +void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, + struct xfs_inobt_rec_incore *irec); + +int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 7c471881c9a6..317caba9faa6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -175,6 +175,18 @@ xfs_inobt_init_key_from_rec( } STATIC void +xfs_inobt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->inobt.ir_startino); + x += XFS_INODES_PER_CHUNK - 1; + key->inobt.ir_startino = cpu_to_be32(x); +} + +STATIC void xfs_inobt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -219,15 +231,25 @@ xfs_finobt_init_ptr_from_cur( ptr->s = agi->agi_free_root; } -STATIC __int64_t +STATIC int64_t xfs_inobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { - return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + return (int64_t)be32_to_cpu(key->inobt.ir_startino) - cur->bc_rec.i.ir_startino; } +STATIC int64_t +xfs_inobt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - + be32_to_cpu(k2->inobt.ir_startino); +} + static int xfs_inobt_verify( struct xfs_buf *bp) @@ -302,7 +324,6 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_write = xfs_inobt_write_verify, }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -322,7 +343,6 @@ xfs_inobt_recs_inorder( return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= be32_to_cpu(r2->inobt.ir_startino); } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), @@ -335,14 +355,14 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#if defined(DEBUG) || defined(XFS_WARN) + .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, -#endif }; static const struct xfs_btree_ops xfs_finobt_ops = { @@ -356,14 +376,14 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#if defined(DEBUG) || defined(XFS_WARN) + .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09c3d1aecef2..378f8fbc91a7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -105,8 +105,7 @@ xfs_inode_buf_verify( di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && xfs_dinode_good_version(mp, dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { + XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); @@ -381,7 +380,7 @@ xfs_log_dinode_to_disk( } } -static bool +bool xfs_dinode_verify( struct xfs_mount *mp, xfs_ino_t ino, @@ -444,7 +443,7 @@ xfs_dinode_calc_crc( struct xfs_mount *mp, struct xfs_dinode *dip) { - __uint32_t crc; + uint32_t crc; if (dip->di_version < 3) return; diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 6848a0afbce7..a9c97a356c30 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,26 +28,26 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint16_t di_flushiter; /* incremented on flush */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint16_t di_projid_lo; /* lower part of owner's project id */ - __uint16_t di_projid_hi; /* higher part of owner's project id */ + int8_t di_version; /* inode version */ + int8_t di_format; /* format of di_c data */ + uint16_t di_flushiter; /* incremented on flush */ + uint32_t di_uid; /* owner's user id */ + uint32_t di_gid; /* owner's group id */ + uint16_t di_projid_lo; /* lower part of owner's project id */ + uint16_t di_projid_hi; /* higher part of owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ xfs_extnum_t di_nextents; /* number of extents in data fork */ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + int8_t di_aformat; /* format of attr fork's data */ + uint32_t di_dmevmask; /* DMIG event mask */ + uint16_t di_dmstate; /* DMIG state info */ + uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint64_t di_flags2; /* more random flags */ - __uint32_t di_cowextsize; /* basic cow extent size for file */ + uint64_t di_flags2; /* more random flags */ + uint32_t di_cowextsize; /* basic cow extent size for file */ xfs_ictimestamp_t di_crtime; /* time created */ }; @@ -82,4 +82,7 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ +bool xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_dinode *dip); + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 7ae571f8e34a..8372e9bcd7b6 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -31,7 +31,7 @@ struct xfs_trans_res; * through all the log items definitions and everything they encode into the * log. */ -typedef __uint32_t xlog_tid_t; +typedef uint32_t xlog_tid_t; #define XLOG_MIN_ICLOGS 2 #define XLOG_MAX_ICLOGS 8 @@ -211,7 +211,7 @@ typedef struct xfs_log_iovec { typedef struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ - __int32_t th_tid; /* transaction id (unused) */ + int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ } xfs_trans_header_t; @@ -265,52 +265,52 @@ typedef struct xfs_trans_header { * must be added on to the end. */ typedef struct xfs_inode_log_format { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ + uint16_t ilf_type; /* inode log item type */ + uint16_t ilf_size; /* size of this item */ + uint32_t ilf_fields; /* flags for fields logged */ + uint16_t ilf_asize; /* size of attr d/ext/root */ + uint16_t ilf_dsize; /* size of data/ext/root */ + uint64_t ilf_ino; /* inode number */ union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ + int64_t ilf_blkno; /* blkno of inode buffer */ + int32_t ilf_len; /* len of inode buffer */ + int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; typedef struct xfs_inode_log_format_32 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ + uint16_t ilf_type; /* inode log item type */ + uint16_t ilf_size; /* size of this item */ + uint32_t ilf_fields; /* flags for fields logged */ + uint16_t ilf_asize; /* size of attr d/ext/root */ + uint16_t ilf_dsize; /* size of data/ext/root */ + uint64_t ilf_ino; /* inode number */ union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ + int64_t ilf_blkno; /* blkno of inode buffer */ + int32_t ilf_len; /* len of inode buffer */ + int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)) xfs_inode_log_format_32_t; typedef struct xfs_inode_log_format_64 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint32_t ilf_pad; /* pad for 64 bit boundary */ - __uint64_t ilf_ino; /* inode number */ + uint16_t ilf_type; /* inode log item type */ + uint16_t ilf_size; /* size of this item */ + uint32_t ilf_fields; /* flags for fields logged */ + uint16_t ilf_asize; /* size of attr d/ext/root */ + uint16_t ilf_dsize; /* size of data/ext/root */ + uint32_t ilf_pad; /* pad for 64 bit boundary */ + uint64_t ilf_ino; /* inode number */ union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ + int64_t ilf_blkno; /* blkno of inode buffer */ + int32_t ilf_len; /* len of inode buffer */ + int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_64_t; @@ -379,8 +379,8 @@ static inline int xfs_ilog_fdata(int w) * information. */ typedef struct xfs_ictimestamp { - __int32_t t_sec; /* timestamp seconds */ - __int32_t t_nsec; /* timestamp nanoseconds */ + int32_t t_sec; /* timestamp seconds */ + int32_t t_nsec; /* timestamp nanoseconds */ } xfs_ictimestamp_t; /* @@ -388,18 +388,18 @@ typedef struct xfs_ictimestamp { * kept identical to struct xfs_dinode except for the endianness annotations. */ struct xfs_log_dinode { - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __uint16_t di_mode; /* mode and type of file */ - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint8_t di_pad3[2]; /* unused in v2/3 inodes */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid_lo; /* lower part of owner's project id */ - __uint16_t di_projid_hi; /* higher part of owner's project id */ - __uint8_t di_pad[6]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ + uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + uint16_t di_mode; /* mode and type of file */ + int8_t di_version; /* inode version */ + int8_t di_format; /* format of di_c data */ + uint8_t di_pad3[2]; /* unused in v2/3 inodes */ + uint32_t di_uid; /* owner's user id */ + uint32_t di_gid; /* owner's group id */ + uint32_t di_nlink; /* number of links to file */ + uint16_t di_projid_lo; /* lower part of owner's project id */ + uint16_t di_projid_hi; /* higher part of owner's project id */ + uint8_t di_pad[6]; /* unused, zeroed space */ + uint16_t di_flushiter; /* incremented on flush */ xfs_ictimestamp_t di_atime; /* time last accessed */ xfs_ictimestamp_t di_mtime; /* time last modified */ xfs_ictimestamp_t di_ctime; /* time created/inode modified */ @@ -408,23 +408,23 @@ struct xfs_log_dinode { xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ xfs_extnum_t di_nextents; /* number of extents in data fork */ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint32_t di_gen; /* generation number */ + uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + int8_t di_aformat; /* format of attr fork's data */ + uint32_t di_dmevmask; /* DMIG event mask */ + uint16_t di_dmstate; /* DMIG state info */ + uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + uint32_t di_gen; /* generation number */ /* di_next_unlinked is the only non-core field in the old dinode */ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ /* start of the extended dinode, writable fields */ - __uint32_t di_crc; /* CRC of the inode */ - __uint64_t di_changecount; /* number of attribute changes */ + uint32_t di_crc; /* CRC of the inode */ + uint64_t di_changecount; /* number of attribute changes */ xfs_lsn_t di_lsn; /* flush sequence */ - __uint64_t di_flags2; /* more random flags */ - __uint32_t di_cowextsize; /* basic cow extent size for file */ - __uint8_t di_pad2[12]; /* more padding for future expansion */ + uint64_t di_flags2; /* more random flags */ + uint32_t di_cowextsize; /* basic cow extent size for file */ + uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ @@ -483,7 +483,7 @@ typedef struct xfs_buf_log_format { unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ unsigned short blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ + int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ } xfs_buf_log_format_t; @@ -533,7 +533,7 @@ xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); } -static inline __uint16_t +static inline uint16_t xfs_blft_from_flags(struct xfs_buf_log_format *blf) { return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; @@ -554,14 +554,14 @@ typedef struct xfs_extent { * conversion routine. */ typedef struct xfs_extent_32 { - __uint64_t ext_start; - __uint32_t ext_len; + uint64_t ext_start; + uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; typedef struct xfs_extent_64 { - __uint64_t ext_start; - __uint32_t ext_len; - __uint32_t ext_pad; + uint64_t ext_start; + uint32_t ext_len; + uint32_t ext_pad; } xfs_extent_64_t; /* @@ -570,26 +570,26 @@ typedef struct xfs_extent_64 { * size is given by efi_nextents. */ typedef struct xfs_efi_log_format { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ + uint16_t efi_type; /* efi log item type */ + uint16_t efi_size; /* size of this item */ + uint32_t efi_nextents; /* # extents to free */ + uint64_t efi_id; /* efi identifier */ xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; typedef struct xfs_efi_log_format_32 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ + uint16_t efi_type; /* efi log item type */ + uint16_t efi_size; /* size of this item */ + uint32_t efi_nextents; /* # extents to free */ + uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; typedef struct xfs_efi_log_format_64 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ + uint16_t efi_type; /* efi log item type */ + uint16_t efi_size; /* size of this item */ + uint32_t efi_nextents; /* # extents to free */ + uint64_t efi_id; /* efi identifier */ xfs_extent_64_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_64_t; @@ -599,26 +599,26 @@ typedef struct xfs_efi_log_format_64 { * size is given by efd_nextents; */ typedef struct xfs_efd_log_format { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ + uint16_t efd_type; /* efd log item type */ + uint16_t efd_size; /* size of this item */ + uint32_t efd_nextents; /* # of extents freed */ + uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; typedef struct xfs_efd_log_format_32 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ + uint16_t efd_type; /* efd log item type */ + uint16_t efd_size; /* size of this item */ + uint32_t efd_nextents; /* # of extents freed */ + uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; typedef struct xfs_efd_log_format_64 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ + uint16_t efd_type; /* efd log item type */ + uint16_t efd_size; /* size of this item */ + uint32_t efd_nextents; /* # of extents freed */ + uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_64_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_64_t; @@ -626,11 +626,11 @@ typedef struct xfs_efd_log_format_64 { * RUI/RUD (reverse mapping) log format definitions */ struct xfs_map_extent { - __uint64_t me_owner; - __uint64_t me_startblock; - __uint64_t me_startoff; - __uint32_t me_len; - __uint32_t me_flags; + uint64_t me_owner; + uint64_t me_startblock; + uint64_t me_startoff; + uint32_t me_len; + uint32_t me_flags; }; /* rmap me_flags: upper bits are flags, lower byte is type code */ @@ -659,10 +659,10 @@ struct xfs_map_extent { * size is given by rui_nextents. */ struct xfs_rui_log_format { - __uint16_t rui_type; /* rui log item type */ - __uint16_t rui_size; /* size of this item */ - __uint32_t rui_nextents; /* # extents to free */ - __uint64_t rui_id; /* rui identifier */ + uint16_t rui_type; /* rui log item type */ + uint16_t rui_size; /* size of this item */ + uint32_t rui_nextents; /* # extents to free */ + uint64_t rui_id; /* rui identifier */ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; @@ -680,19 +680,19 @@ xfs_rui_log_format_sizeof( * size is given by rud_nextents; */ struct xfs_rud_log_format { - __uint16_t rud_type; /* rud log item type */ - __uint16_t rud_size; /* size of this item */ - __uint32_t __pad; - __uint64_t rud_rui_id; /* id of corresponding rui */ + uint16_t rud_type; /* rud log item type */ + uint16_t rud_size; /* size of this item */ + uint32_t __pad; + uint64_t rud_rui_id; /* id of corresponding rui */ }; /* * CUI/CUD (refcount update) log format definitions */ struct xfs_phys_extent { - __uint64_t pe_startblock; - __uint32_t pe_len; - __uint32_t pe_flags; + uint64_t pe_startblock; + uint32_t pe_len; + uint32_t pe_flags; }; /* refcount pe_flags: upper bits are flags, lower byte is type code */ @@ -707,10 +707,10 @@ struct xfs_phys_extent { * size is given by cui_nextents. */ struct xfs_cui_log_format { - __uint16_t cui_type; /* cui log item type */ - __uint16_t cui_size; /* size of this item */ - __uint32_t cui_nextents; /* # extents to free */ - __uint64_t cui_id; /* cui identifier */ + uint16_t cui_type; /* cui log item type */ + uint16_t cui_size; /* size of this item */ + uint32_t cui_nextents; /* # extents to free */ + uint64_t cui_id; /* cui identifier */ struct xfs_phys_extent cui_extents[]; /* array of extents */ }; @@ -728,10 +728,10 @@ xfs_cui_log_format_sizeof( * size is given by cud_nextents; */ struct xfs_cud_log_format { - __uint16_t cud_type; /* cud log item type */ - __uint16_t cud_size; /* size of this item */ - __uint32_t __pad; - __uint64_t cud_cui_id; /* id of corresponding cui */ + uint16_t cud_type; /* cud log item type */ + uint16_t cud_size; /* size of this item */ + uint32_t __pad; + uint64_t cud_cui_id; /* id of corresponding cui */ }; /* @@ -755,10 +755,10 @@ struct xfs_cud_log_format { * size is given by bui_nextents. */ struct xfs_bui_log_format { - __uint16_t bui_type; /* bui log item type */ - __uint16_t bui_size; /* size of this item */ - __uint32_t bui_nextents; /* # extents to free */ - __uint64_t bui_id; /* bui identifier */ + uint16_t bui_type; /* bui log item type */ + uint16_t bui_size; /* size of this item */ + uint32_t bui_nextents; /* # extents to free */ + uint64_t bui_id; /* bui identifier */ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */ }; @@ -776,10 +776,10 @@ xfs_bui_log_format_sizeof( * size is given by bud_nextents; */ struct xfs_bud_log_format { - __uint16_t bud_type; /* bud log item type */ - __uint16_t bud_size; /* size of this item */ - __uint32_t __pad; - __uint64_t bud_bui_id; /* id of corresponding bui */ + uint16_t bud_type; /* bud log item type */ + uint16_t bud_size; /* size of this item */ + uint32_t __pad; + uint64_t bud_bui_id; /* id of corresponding bui */ }; /* @@ -789,12 +789,12 @@ struct xfs_bud_log_format { * 32 bits : log_recovery code assumes that. */ typedef struct xfs_dq_logformat { - __uint16_t qlf_type; /* dquot log item type */ - __uint16_t qlf_size; /* size of this item */ + uint16_t qlf_type; /* dquot log item type */ + uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ - __int64_t qlf_blkno; /* blkno of dquot buffer */ - __int32_t qlf_len; /* len of dquot buffer */ - __uint32_t qlf_boffset; /* off of dquot in buffer */ + int64_t qlf_blkno; /* blkno of dquot buffer */ + int32_t qlf_len; /* len of dquot buffer */ + uint32_t qlf_boffset; /* off of dquot in buffer */ } xfs_dq_logformat_t; /* @@ -853,8 +853,8 @@ typedef struct xfs_qoff_logformat { * decoding can be done correctly. */ struct xfs_icreate_log { - __uint16_t icl_type; /* type of log format structure */ - __uint16_t icl_size; /* size of log format structure */ + uint16_t icl_type; /* type of log format structure */ + uint16_t icl_size; /* size of log format structure */ __be32 icl_ag; /* ag being allocated in */ __be32 icl_agbno; /* start block of inode range */ __be32 icl_count; /* number of inodes to initialise */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 29a01ec89dd0..66948a9fd486 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -26,7 +26,7 @@ #define XLOG_RHASH_SIZE 16 #define XLOG_RHASH_SHIFT 2 #define XLOG_RHASH(tid) \ - ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + ((((uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) #define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 8eed51275bb3..2834574cb6e7 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -27,8 +27,8 @@ * they may need 64-bit accounting. Hence, 64-bit quota-counters, * and quota-limits. This is a waste in the common case, but hey ... */ -typedef __uint64_t xfs_qcnt_t; -typedef __uint16_t xfs_qwarncnt_t; +typedef uint64_t xfs_qcnt_t; +typedef uint16_t xfs_qwarncnt_t; /* * flags for q_flags field in the dquot. @@ -136,6 +136,8 @@ typedef __uint16_t xfs_qwarncnt_t; */ #define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_NOLOCK 0x2000000 /* don't ilock during dqget */ + /* * flags to xfs_trans_mod_dquot. */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..900ea231f9a3 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -784,14 +784,6 @@ xfs_refcount_merge_extents( } /* - * While we're adjusting the refcounts records of an extent, we have - * to keep an eye on the number of extents we're dirtying -- run too - * many in a single transaction and we'll exceed the transaction's - * reservation and crash the fs. Each record adds 12 bytes to the - * log (plus any key updates) so we'll conservatively assume 24 bytes - * per record. We must also leave space for btree splits on both ends - * of the range and space for the CUD and a new CUI. - * * XXX: This is a pretty hand-wavy estimate. The penalty for guessing * true incorrectly is a shutdown FS; the penalty for guessing false * incorrectly is more transaction rolls than might be necessary. @@ -813,8 +805,7 @@ xfs_refcount_still_have_space( */ if (cur->bc_private.a.priv.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) + XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_private.a.priv.refc.nr_ops == 0) @@ -822,7 +813,7 @@ xfs_refcount_still_have_space( else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * 32; + cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -1076,8 +1067,7 @@ xfs_refcount_finish_one( blockcount); if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_REFCOUNT_FINISH_ONE, - XFS_RANDOM_REFCOUNT_FINISH_ONE)) + XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1629,13 +1619,28 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1650,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1682,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1702,6 @@ out_free: out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 098dc668ab2c..eafb9d1f3b37 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, xfs_agnumber_t agno); +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 32 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + */ +#define XFS_REFCOUNT_ITEM_OVERHEAD 32 + +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) +{ + return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; +} + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 50add5272807..3c59dd3d58d7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -202,7 +202,7 @@ xfs_refcountbt_init_ptr_from_cur( ptr->s = agf->agf_refcount_root; } -STATIC __int64_t +STATIC int64_t xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -210,16 +210,16 @@ xfs_refcountbt_key_diff( struct xfs_refcount_irec *rec = &cur->bc_rec.rc; struct xfs_refcount_key *kp = &key->refc; - return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; } -STATIC __int64_t +STATIC int64_t xfs_refcountbt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - be32_to_cpu(k2->refc.rc_startblock); } @@ -285,7 +285,6 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .verify_write = xfs_refcountbt_write_verify, }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_refcountbt_keys_inorder( struct xfs_btree_cur *cur, @@ -306,7 +305,6 @@ xfs_refcountbt_recs_inorder( be32_to_cpu(r1->refc.rc_blockcount) <= be32_to_cpu(r2->refc.rc_startblock); } -#endif static const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), @@ -325,10 +323,8 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { .key_diff = xfs_refcountbt_key_diff, .buf_ops = &xfs_refcountbt_buf_ops, .diff_two_keys = xfs_refcountbt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 06cfb93c2ef9..55c88a732690 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -179,7 +179,8 @@ done: return error; } -static int +/* Convert an internal btree record to an rmap record. */ +int xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) @@ -2061,7 +2062,7 @@ int xfs_rmap_finish_one( struct xfs_trans *tp, enum xfs_rmap_intent_type type, - __uint64_t owner, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, @@ -2086,8 +2087,7 @@ xfs_rmap_finish_one( startoff, blockcount, state); if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_RMAP_FINISH_ONE, - XFS_RANDOM_RMAP_FINISH_ONE)) + XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* @@ -2182,7 +2182,7 @@ __xfs_rmap_add( struct xfs_mount *mp, struct xfs_defer_ops *dfops, enum xfs_rmap_intent_type type, - __uint64_t owner, + uint64_t owner, int whichfork, struct xfs_bmbt_irec *bmap) { @@ -2266,7 +2266,7 @@ xfs_rmap_alloc_extent( xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner) + uint64_t owner) { struct xfs_bmbt_irec bmap; @@ -2290,7 +2290,7 @@ xfs_rmap_free_extent( xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner) + uint64_t owner) { struct xfs_bmbt_irec bmap; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 98f908fea103..466ede637080 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -179,7 +179,7 @@ enum xfs_rmap_intent_type { struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; - __uint64_t ri_owner; + uint64_t ri_owner; int ri_whichfork; struct xfs_bmbt_irec ri_bmap; }; @@ -196,15 +196,15 @@ int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *imap); int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner); + uint64_t owner); int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - __uint64_t owner); + uint64_t owner); void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, - __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); @@ -216,5 +216,8 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); +union xfs_btree_rec; +int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 74e5a54bc428..9d9c9192584c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -199,7 +199,7 @@ xfs_rmapbt_init_high_key_from_rec( union xfs_btree_key *key, union xfs_btree_rec *rec) { - __uint64_t off; + uint64_t off; int adj; adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; @@ -241,7 +241,7 @@ xfs_rmapbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } -STATIC __int64_t +STATIC int64_t xfs_rmapbt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -249,9 +249,9 @@ xfs_rmapbt_key_diff( struct xfs_rmap_irec *rec = &cur->bc_rec.r; struct xfs_rmap_key *kp = &key->rmap; __u64 x, y; - __int64_t d; + int64_t d; - d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; if (d) return d; @@ -271,7 +271,7 @@ xfs_rmapbt_key_diff( return 0; } -STATIC __int64_t +STATIC int64_t xfs_rmapbt_diff_two_keys( struct xfs_btree_cur *cur, union xfs_btree_key *k1, @@ -279,10 +279,10 @@ xfs_rmapbt_diff_two_keys( { struct xfs_rmap_key *kp1 = &k1->rmap; struct xfs_rmap_key *kp2 = &k2->rmap; - __int64_t d; + int64_t d; __u64 x, y; - d = (__int64_t)be32_to_cpu(kp1->rm_startblock) - + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - be32_to_cpu(kp2->rm_startblock); if (d) return d; @@ -377,17 +377,16 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .verify_write = xfs_rmapbt_write_verify, }; -#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_rmapbt_keys_inorder( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - __uint32_t x; - __uint32_t y; - __uint64_t a; - __uint64_t b; + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; x = be32_to_cpu(k1->rmap.rm_startblock); y = be32_to_cpu(k2->rmap.rm_startblock); @@ -414,10 +413,10 @@ xfs_rmapbt_recs_inorder( union xfs_btree_rec *r1, union xfs_btree_rec *r2) { - __uint32_t x; - __uint32_t y; - __uint64_t a; - __uint64_t b; + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; x = be32_to_cpu(r1->rmap.rm_startblock); y = be32_to_cpu(r2->rmap.rm_startblock); @@ -437,7 +436,6 @@ xfs_rmapbt_recs_inorder( return 1; return 0; } -#endif /* DEBUG */ static const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), @@ -456,10 +454,8 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .key_diff = xfs_rmapbt_key_diff, .buf_ops = &xfs_rmapbt_buf_ops, .diff_two_keys = xfs_rmapbt_diff_two_keys, -#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, -#endif }; /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e47b99e59f60..5d4e43ef4eea 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -static int +int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ @@ -1011,7 +1011,7 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; + *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } return 0; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 584ec896a533..9b5aae2bcc0b 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -448,7 +448,7 @@ xfs_sb_quota_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { - __uint16_t qflags = from->sb_qflags; + uint16_t qflags = from->sb_qflags; to->sb_uquotino = cpu_to_be64(from->sb_uquotino); if (xfs_sb_version_has_pquotino(from)) { @@ -756,7 +756,7 @@ xfs_sb_mount_common( mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + mp->m_ialloc_inos = (int)MAX((uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 2e2c6716b623..c484877129a0 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -114,7 +114,7 @@ xfs_symlink_verify( if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) return false; if (be32_to_cpu(dsl->sl_offset) + - be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) return false; if (dsl->sl_owner == 0) return false; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index b456cca1bfb2..6bd916bd35e2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -477,14 +477,14 @@ xfs_calc_mkdir_reservation( /* * Making a new symplink is the same as creating a new file, but * with the added blocks for remote symlink data which can be up to 1kB in - * length (MAXPATHLEN). + * length (XFS_SYMLINK_MAXLEN). */ STATIC uint xfs_calc_symlink_reservation( struct xfs_mount *mp) { return xfs_calc_create_reservation(mp) + - xfs_calc_buf_res(1, MAXPATHLEN); + xfs_calc_buf_res(1, XFS_SYMLINK_MAXLEN); } /* diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 717909f2f7b7..0220159bd463 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -18,34 +18,34 @@ #ifndef __XFS_TYPES_H__ #define __XFS_TYPES_H__ -typedef __uint32_t prid_t; /* project ID */ +typedef uint32_t prid_t; /* project ID */ -typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ -typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ -typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ -typedef __uint32_t xfs_agnumber_t; /* allocation group number */ -typedef __int32_t xfs_extnum_t; /* # of extents in a file */ -typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ -typedef __int64_t xfs_fsize_t; /* bytes in a file */ -typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ +typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ +typedef uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef uint32_t xfs_agnumber_t; /* allocation group number */ +typedef int32_t xfs_extnum_t; /* # of extents in a file */ +typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef int64_t xfs_fsize_t; /* bytes in a file */ +typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ -typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ -typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ +typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */ -typedef __int64_t xfs_lsn_t; /* log sequence number */ -typedef __int32_t xfs_tid_t; /* transaction identifier */ +typedef int64_t xfs_lsn_t; /* log sequence number */ +typedef int32_t xfs_tid_t; /* transaction identifier */ -typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ -typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ +typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ -typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ -typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ -typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ -typedef __uint64_t xfs_fileoff_t; /* block number in a file */ -typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ +typedef uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef uint64_t xfs_fileoff_t; /* block number in a file */ +typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ -typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ -typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ +typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +typedef int64_t xfs_sfiloff_t; /* signed block number in a file */ /* * Null values for the types. @@ -125,7 +125,7 @@ struct xfs_name { * uid_t and gid_t are hard-coded to 32 bits in the inode. * Hence, an 'id' in a dquot is 32 bits.. */ -typedef __uint32_t xfs_dqid_t; +typedef uint32_t xfs_dqid_t; /* * Constants for bit manipulations. diff --git a/fs/xfs/uuid.c b/fs/xfs/uuid.c deleted file mode 100644 index b83f76b6d410..000000000000 --- a/fs/xfs/uuid.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -/* IRIX interpretation of an uuid_t */ -typedef struct { - __be32 uu_timelow; - __be16 uu_timemid; - __be16 uu_timehi; - __be16 uu_clockseq; - __be16 uu_node[3]; -} xfs_uu_t; - -/* - * uuid_getnodeuniq - obtain the node unique fields of a UUID. - * - * This is not in any way a standard or condoned UUID function; - * it just something that's needed for user-level file handles. - */ -void -uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) -{ - xfs_uu_t *uup = (xfs_uu_t *)uuid; - - fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | - be16_to_cpu(uup->uu_timemid); - fsid[1] = be32_to_cpu(uup->uu_timelow); -} - -int -uuid_is_nil(uuid_t *uuid) -{ - int i; - char *cp = (char *)uuid; - - if (uuid == NULL) - return 0; - /* implied check of version number here... */ - for (i = 0; i < sizeof *uuid; i++) - if (*cp++) return 0; /* not nil */ - return 1; /* is nil */ -} - -int -uuid_equal(uuid_t *uuid1, uuid_t *uuid2) -{ - return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; -} diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h deleted file mode 100644 index 104db0f3bed6..000000000000 --- a/fs/xfs/uuid.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_UUID_H__ -#define __XFS_SUPPORT_UUID_H__ - -typedef struct { - unsigned char __u_bits[16]; -} uuid_t; - -extern int uuid_is_nil(uuid_t *uuid); -extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); -extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); - -static inline void -uuid_copy(uuid_t *dst, uuid_t *src) -{ - memcpy(dst, src, sizeof(uuid_t)); -} - -#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index a742c47f7d5a..80cd0fd86783 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,6 +24,10 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif +#ifdef CONFIG_XFS_ASSERT_FATAL +#define XFS_ASSERT_FATAL 1 +#endif + #ifdef CONFIG_XFS_WARN #define XFS_WARN 1 #endif diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b468e041f207..7034e17535de 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -170,8 +170,8 @@ xfs_get_acl(struct inode *inode, int type) return acl; } -STATIC int -__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; @@ -268,5 +268,5 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) } set_acl: - return __xfs_set_acl(inode, type, acl); + return __xfs_set_acl(inode, acl, type); } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 286fa89217f5..04327318ef67 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -24,6 +24,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 09af0f7cd55e..6bf120bb1a17 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,7 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - int error = ioend->io_bio->bi_error; + int error; /* * Just clean up the in-memory strutures if the fs has been shut down. @@ -289,6 +289,7 @@ xfs_end_io( /* * Clean up any COW blocks on an I/O error. */ + error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { switch (ioend->io_type) { case XFS_IO_COW: @@ -332,7 +333,7 @@ xfs_end_bio( else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else - xfs_destroy_ioend(ioend, bio->bi_error); + xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } STATIC int @@ -500,11 +501,12 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_bio->bi_error = status; + ioend->io_bio->bi_status = errno_to_blk_status(status); bio_endio(ioend->io_bio); return status; } + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); return 0; } @@ -564,6 +566,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -836,7 +839,7 @@ xfs_writepage_map( struct inode *inode, struct page *page, loff_t offset, - __uint64_t end_offset) + uint64_t end_offset) { LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; @@ -991,7 +994,7 @@ xfs_do_writepage( struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; loff_t offset; - __uint64_t end_offset; + uint64_t end_offset; pgoff_t end_index; trace_xfs_writepage(inode, page, 0, 0); @@ -1316,9 +1319,12 @@ xfs_vm_bmap( * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasseÑ• the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, - * 0 is the magic code for a bmap error.. + * 0 is the magic code for a bmap error. + * + * Since we don't pass back blockdev info, we can't return bmap + * information for rt files either. */ - if (xfs_is_reflink_inode(ip)) + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; filemap_write_and_wait(mapping); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index d14691aa02b4..5d5a5e277f35 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -117,6 +117,7 @@ typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { + struct xfs_trans *tp; struct xfs_inode *dp; /* inode */ struct attrlist_cursor_kern *cursor; /* position in list */ char *alist; /* output buffer */ @@ -140,8 +141,10 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. */ int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, unsigned char *value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 97c45b6eb91e..545eca508d42 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -230,7 +230,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != -EFSCORRUPTED)) return error; @@ -242,7 +242,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: @@ -254,18 +254,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (cursor->hashval > be32_to_cpu( entries[leafhdr.count - 1].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; } else if (cursor->hashval <= be32_to_cpu( entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; } break; default: trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); bp = NULL; } } @@ -279,9 +279,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - __uint16_t magic; + uint16_t magic; - error = xfs_da3_node_read(NULL, dp, + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) @@ -297,7 +297,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, node); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return -EFSCORRUPTED; } @@ -313,10 +313,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) } } if (i == nodehdr.count) { - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return 0; } - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); } } ASSERT(bp != NULL); @@ -333,12 +333,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; - xfs_trans_brelse(NULL, bp); - error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); + xfs_trans_brelse(context->tp, bp); + error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp); if (error) return error; } - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return 0; } @@ -448,16 +448,34 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp); if (error) return error; xfs_attr3_leaf_list_int(bp, context); - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(context->tp, bp); return 0; } int +xfs_attr_list_int_ilocked( + struct xfs_attr_list_context *context) +{ + struct xfs_inode *dp = context->dp; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) + return 0; + else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + return xfs_attr_shortform_list(context); + else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + return xfs_attr_leaf_list(context); + return xfs_attr_node_list(context); +} + +int xfs_attr_list_int( xfs_attr_list_context_t *context) { @@ -470,19 +488,8 @@ xfs_attr_list_int( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - /* - * Decide on what work routines to call based on the inode size. - */ lock_mode = xfs_ilock_attr_map_shared(dp); - if (!xfs_inode_hasattr(dp)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(context); - } else { - error = xfs_attr_node_list(context); - } + error = xfs_attr_list_int_ilocked(context); xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index d419d23fa214..88073910fa5d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -396,6 +396,7 @@ xfs_bui_recover( struct xfs_map_extent *bmap; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; + xfs_filblks_t count; bool op_ok; struct xfs_bud_log_item *budp; enum xfs_bmap_intent_type type; @@ -404,6 +405,7 @@ xfs_bui_recover( struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_defer_ops dfops; + struct xfs_bmbt_irec irec; xfs_fsblock_t firstfsb; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -481,13 +483,24 @@ xfs_bui_recover( } xfs_trans_ijoin(tp, ip, 0); + count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, ip, whichfork, bmap->me_startoff, - bmap->me_startblock, bmap->me_len, - state); + bmap->me_startblock, &count, state); if (error) goto err_dfops; + if (count > 0) { + ASSERT(type == XFS_BMAP_UNMAP); + irec.br_startblock = bmap->me_startblock; + irec.br_blockcount = count; + irec.br_startoff = bmap->me_startoff; + irec.br_state = state; + error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + if (error) + goto err_dfops; + } + /* Finish transaction, free inodes. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2b954308a1d6..93e955262d07 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -219,20 +219,24 @@ xfs_bmap_eof( */ /* - * Count leaf blocks given a range of extent records. + * Count leaf blocks given a range of extent records. Delayed allocation + * extents are not counted towards the totals. */ STATIC void xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) + struct xfs_ifork *ifp, + xfs_extnum_t *numrecs, + xfs_filblks_t *count) { - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); + xfs_extnum_t i; + xfs_extnum_t nr_exts = xfs_iext_count(ifp); + + for (i = 0; i < nr_exts; i++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) { + (*numrecs)++; + *count += xfs_bmbt_get_blockcount(frp); + } } } @@ -245,7 +249,7 @@ xfs_bmap_disk_count_leaves( struct xfs_mount *mp, struct xfs_btree_block *block, int numrecs, - int *count) + xfs_filblks_t *count) { int b; xfs_bmbt_rec_t *frp; @@ -260,17 +264,18 @@ xfs_bmap_disk_count_leaves( * Recursively walks each level of a btree * to count total fsblocks in use. */ -STATIC int /* error */ +STATIC int xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_ifork *ifp, + xfs_fsblock_t blockno, + int levelin, + xfs_extnum_t *nextents, + xfs_filblks_t *count) { int error; - xfs_buf_t *bp, *nbp; + struct xfs_buf *bp, *nbp; int level = levelin; __be64 *pp; xfs_fsblock_t bno = blockno; @@ -303,8 +308,9 @@ xfs_bmap_count_tree( /* Dive to the next level */ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents, + count); + if (error) { xfs_trans_brelse(tp, bp); XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", XFS_ERRLEVEL_LOW, mp); @@ -316,6 +322,7 @@ xfs_bmap_count_tree( for (;;) { nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); numrecs = be16_to_cpu(block->bb_numrecs); + (*nextents) += numrecs; xfs_bmap_disk_count_leaves(mp, block, numrecs, count); xfs_trans_brelse(tp, bp); if (nextbno == NULLFSBLOCK) @@ -334,46 +341,64 @@ xfs_bmap_count_tree( } /* - * Count fsblocks of the given fork. + * Count fsblocks of the given fork. Delayed allocation extents are + * not counted towards the totals. */ -static int /* error */ +int xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *nextents, + xfs_filblks_t *count) { + struct xfs_mount *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ struct xfs_btree_block *block; /* current btree block */ + struct xfs_ifork *ifp; /* fork structure */ xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ + int error; bno = NULLFSBLOCK; mp = ip->i_mount; + *nextents = 0; + *count = 0; ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); + if (!ifp) return 0; - } - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return -EFSCORRUPTED; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_EXTENTS: + xfs_bmap_count_leaves(ifp, nextents, count); + return 0; + case XFS_DINODE_FMT_BTREE: + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLFSBLOCK); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, + nextents, count); + if (error) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; } return 0; @@ -389,11 +414,11 @@ xfs_getbmapx_fix_eof_hole( struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with * preallocated data space */ - __int64_t end, /* last block requested */ + int64_t end, /* last block requested */ xfs_fsblock_t startblock, bool moretocome) { - __int64_t fixlen; + int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_extnum_t lastx; /* last extent pointer */ @@ -455,8 +480,8 @@ xfs_getbmap_adjust_shared( agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); - error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, - &ebno, &elen, true); + error = xfs_reflink_find_shared(mp, NULL, agno, agbno, + map->br_blockcount, &ebno, &elen, true); if (error) return error; @@ -514,9 +539,9 @@ xfs_getbmap( xfs_bmap_format_t formatter, /* format to user */ void *arg) /* formatter arg */ { - __int64_t bmvend; /* last block requested */ + int64_t bmvend; /* last block requested */ int error = 0; /* return value */ - __int64_t fixlen; /* length for -1 case */ + int64_t fixlen; /* length for -1 case */ int i; /* extent number */ int lock; /* lock state */ xfs_bmbt_irec_t *map; /* buffer for user's data */ @@ -582,9 +607,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || @@ -601,7 +630,7 @@ xfs_getbmap( if (bmv->bmv_length == -1) { fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); bmv->bmv_length = - max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + max_t(int64_t, fixlen - bmv->bmv_offset, 0); } else if (bmv->bmv_length == 0) { bmv->bmv_entries = 0; return 0; @@ -712,7 +741,7 @@ xfs_getbmap( * extents. */ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && @@ -738,7 +767,7 @@ xfs_getbmap( out[cur_ext].bmv_offset + out[cur_ext].bmv_length; bmv->bmv_length = - max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + max_t(int64_t, 0, bmvend - bmv->bmv_offset); /* * In case we don't want to return the hole, @@ -1613,7 +1642,7 @@ xfs_swap_extents_check_format( * extent format... */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(ip) && + if (XFS_IFORK_Q(ip) && XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) return -EINVAL; if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= @@ -1623,7 +1652,7 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(tip) && + if (XFS_IFORK_Q(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return -EINVAL; if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= @@ -1672,7 +1701,7 @@ xfs_swap_extent_rmap( xfs_filblks_t ilen; xfs_filblks_t rlen; int nimaps; - __uint64_t tip_flags2; + uint64_t tip_flags2; /* * If the source file has shared blocks, we must flag the donor @@ -1785,10 +1814,11 @@ xfs_swap_extent_forks( int *target_log_flags) { struct xfs_ifork tempifp, *ifp, *tifp; - int aforkblks = 0; - int taforkblks = 0; + xfs_filblks_t aforkblks = 0; + xfs_filblks_t taforkblks = 0; + xfs_extnum_t junk; xfs_extnum_t nextents; - __uint64_t tmp; + uint64_t tmp; int error; /* @@ -1796,14 +1826,14 @@ xfs_swap_extent_forks( */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) return error; @@ -1846,15 +1876,15 @@ xfs_swap_extent_forks( /* * Fix the on-disk inode values */ - tmp = (__uint64_t)ip->i_d.di_nblocks; + tmp = (uint64_t)ip->i_d.di_nblocks; ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - tmp = (__uint64_t) ip->i_d.di_nextents; + tmp = (uint64_t) ip->i_d.di_nextents; ip->i_d.di_nextents = tip->i_d.di_nextents; tip->i_d.di_nextents = tmp; - tmp = (__uint64_t) ip->i_d.di_format; + tmp = (uint64_t) ip->i_d.di_format; ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; @@ -1923,7 +1953,7 @@ xfs_swap_extents( int error = 0; int lock_flags; struct xfs_ifork *cowfp; - __uint64_t f; + uint64_t f; int resblks; /* diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 135d8267e284..0cede1043571 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -70,4 +70,8 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_extnum_t *nextents, + xfs_filblks_t *count); + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 62fa39276a24..72f038492ba8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -97,12 +97,16 @@ static inline void xfs_buf_ioacct_inc( struct xfs_buf *bp) { - if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + if (bp->b_flags & XBF_NO_IOACCT) return; ASSERT(bp->b_flags & XBF_ASYNC); - bp->b_flags |= _XBF_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); + spin_lock(&bp->b_lock); + if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { + bp->b_state |= XFS_BSTATE_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); + } + spin_unlock(&bp->b_lock); } /* @@ -110,14 +114,24 @@ xfs_buf_ioacct_inc( * freed and unaccount from the buftarg. */ static inline void -xfs_buf_ioacct_dec( +__xfs_buf_ioacct_dec( struct xfs_buf *bp) { - if (!(bp->b_flags & _XBF_IN_FLIGHT)) - return; + lockdep_assert_held(&bp->b_lock); - bp->b_flags &= ~_XBF_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); + } +} + +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + spin_unlock(&bp->b_lock); } /* @@ -149,9 +163,9 @@ xfs_buf_stale( * unaccounted (released to LRU) before that occurs. Drop in-flight * status now to preserve accounting consistency. */ - xfs_buf_ioacct_dec(bp); - spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) @@ -979,12 +993,12 @@ xfs_buf_rele( * ensures the decrement occurs only once per-buf. */ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); goto out_unlock; } /* the last reference has been dropped ... */ - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU take a new reference to the @@ -1180,7 +1194,7 @@ xfs_buf_ioerror_alert( { xfs_alert(bp->b_target->bt_mount, "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", - (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); + (uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } int @@ -1213,8 +1227,11 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) - cmpxchg(&bp->b_io_error, 0, bio->bi_error); + if (bio->bi_status) { + int error = blk_status_to_errno(bio->bi_status); + + cmpxchg(&bp->b_io_error, 0, error); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -2033,6 +2050,66 @@ xfs_buf_delwri_submit( return error; } +/* + * Push a single buffer on a delwri queue. + * + * The purpose of this function is to submit a single buffer of a delwri queue + * and return with the buffer still on the original queue. The waiting delwri + * buffer submission infrastructure guarantees transfer of the delwri queue + * buffer reference to a temporary wait list. We reuse this infrastructure to + * transfer the buffer back to the original queue. + * + * Note the buffer transitions from the queued state, to the submitted and wait + * listed state and back to the queued state during this call. The buffer + * locking and queue management logic between _delwri_pushbuf() and + * _delwri_queue() guarantee that the buffer cannot be queued to another list + * before returning. + */ +int +xfs_buf_delwri_pushbuf( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + LIST_HEAD (submit_list); + int error; + + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); + + /* + * Isolate the buffer to a new local list so we can submit it for I/O + * independently from the rest of the original list. + */ + xfs_buf_lock(bp); + list_move(&bp->b_list, &submit_list); + xfs_buf_unlock(bp); + + /* + * Delwri submission clears the DELWRI_Q buffer flag and returns with + * the buffer on the wait list with an associated reference. Rather than + * bounce the buffer from a local wait list back to the original list + * after I/O completion, reuse the original list as the wait list. + */ + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); + + /* + * The buffer is now under I/O and wait listed as during typical delwri + * submission. Lock the buffer to wait for I/O completion. Rather than + * remove the buffer from the wait list and release the reference, we + * want to return with the buffer queued to the original list. The + * buffer already sits on the original list with a wait list reference, + * however. If we let the queue inherit that wait list reference, all we + * need to do is reset the DELWRI_Q flag. + */ + xfs_buf_lock(bp); + error = bp->b_error; + bp->b_flags |= _XBF_DELWRI_Q; + xfs_buf_unlock(bp); + + return error; +} + int __init xfs_buf_init(void) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8d1d44f87ce9..20721261dae5 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -63,7 +63,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -84,14 +83,14 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_IN_FLIGHT, "IN_FLIGHT" } + { _XBF_COMPOUND, "COMPOUND" } /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ /* * The xfs_buftarg contains 2 notions of "sector size" - @@ -333,6 +332,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 0306168af332..f6a8422e9562 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -636,20 +636,23 @@ xfs_buf_item_unlock( /* * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be dirty and hence in the AIL. Therefore if we are - * aborting a buffer and we've just taken the last refernce away, we - * have to check if it is in the AIL before freeing it. We need to free - * it in this case, because an aborted transaction has already shut the - * filesystem down and this is the last chance we will have to do so. + * buffers may be in the AIL regardless of dirty state. An aborted + * transaction that invalidates a buffer already in the AIL may have + * marked it stale and cleared the dirty state, for example. + * + * Therefore if we are aborting a buffer and we've just taken the last + * reference away, we have to check if it is in the AIL before freeing + * it. We need to free it in this case, because an aborted transaction + * has already shut the filesystem down and this is the last chance we + * will have to do so. */ if (atomic_dec_and_test(&bip->bli_refcount)) { - if (clean) - xfs_buf_item_relse(bp); - else if (aborted) { + if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } + } else if (clean) + xfs_buf_item_relse(bp); } if (!(flags & XFS_BLI_HOLD)) diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 20b7a5c6eb2f..ba2638d37031 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -44,7 +44,7 @@ static unsigned char xfs_dir3_filetype_table[] = { static unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, - __uint8_t filetype) + uint8_t filetype) { if (!xfs_sb_version_hasftype(&mp->m_sb)) return DT_UNKNOWN; @@ -117,7 +117,7 @@ xfs_dir2_sf_getdents( */ sfep = xfs_dir2_sf_firstentry(sfp); for (i = 0; i < sfp->count; i++) { - __uint8_t filetype; + uint8_t filetype; off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, xfs_dir2_sf_get_offset(sfep)); @@ -170,7 +170,7 @@ xfs_dir2_block_getdents( return 0; lock_mode = xfs_ilock_data_map_shared(dp); - error = xfs_dir3_block_read(NULL, dp, &bp); + error = xfs_dir3_block_read(args->trans, dp, &bp); xfs_iunlock(dp, lock_mode); if (error) return error; @@ -194,7 +194,7 @@ xfs_dir2_block_getdents( * Each object is a real entry (dep) or an unused one (dup). */ while (ptr < endptr) { - __uint8_t filetype; + uint8_t filetype; dup = (xfs_dir2_data_unused_t *)ptr; /* @@ -228,7 +228,7 @@ xfs_dir2_block_getdents( if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) { - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); return 0; } } @@ -239,218 +239,104 @@ xfs_dir2_block_getdents( */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); return 0; } -struct xfs_dir2_leaf_map_info { - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ - int nmap; /* mappings to ask xfs_bmapi */ - xfs_dir2_db_t curdb; /* db for current block */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ - struct xfs_bmbt_irec map[]; /* map vector for blocks */ -}; - +/* + * Read a directory block and initiate readahead for blocks beyond that. + * We maintain a sliding readahead window of the remaining space in the + * buffer rounded up to the nearest block. + */ STATIC int xfs_dir2_leaf_readbuf( struct xfs_da_args *args, size_t bufsize, - struct xfs_dir2_leaf_map_info *mip, - xfs_dir2_off_t *curoff, - struct xfs_buf **bpp, - bool trim_map) + xfs_dir2_off_t *cur_off, + xfs_dablk_t *ra_blk, + struct xfs_buf **bpp) { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; - struct xfs_bmbt_irec *map = mip->map; + struct xfs_da_geometry *geo = args->geo; + struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_bmbt_irec map; struct blk_plug plug; + xfs_dir2_off_t new_off; + xfs_dablk_t next_ra; + xfs_dablk_t map_off; + xfs_dablk_t last_da; + xfs_extnum_t idx; + int ra_want; int error = 0; - int length; - int i; - int j; - struct xfs_da_geometry *geo = args->geo; - - /* - * If the caller just finished processing a buffer, it will tell us - * we need to trim that block out of the mapping now it is done. - */ - if (trim_map) { - mip->map_blocks -= geo->fsbcount; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = geo->fsbcount; i > 0; ) { - j = min_t(int, map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --mip->map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * mip->map_valid); - i -= j; - } - } - /* - * Recalculate the readahead blocks wanted. - */ - mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; - ASSERT(mip->ra_want >= 0); - - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + mip->ra_want > mip->map_blocks && - mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - mip->nmap = mip->map_size - mip->map_valid; - error = xfs_bmapi_read(dp, mip->map_off, - xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - - mip->map_off, - &map[mip->map_valid], &mip->nmap, 0); - - /* - * Don't know if we should ignore this or try to return an - * error. The trouble with returning errors is that readdir - * will just stop without actually passing the error through. - */ + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK); if (error) - goto out; /* XXX */ - - /* - * If we got all the mappings we asked for, set the final map - * offset based on the last bmap value received. Otherwise, - * we've reached the end. - */ - if (mip->nmap == mip->map_size - mip->map_valid) { - i = mip->map_valid + mip->nmap - 1; - mip->map_off = map[i].br_startoff + map[i].br_blockcount; - } else - mip->map_off = xfs_dir2_byte_to_da(geo, - XFS_DIR2_LEAF_OFFSET); - - /* - * Look for holes in the mapping, and eliminate them. Count up - * the valid blocks. - */ - for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { - if (map[i].br_startblock == HOLESTARTBLOCK) { - mip->nmap--; - length = mip->map_valid + mip->nmap - i; - if (length) - memmove(&map[i], &map[i + 1], - sizeof(map[i]) * length); - } else { - mip->map_blocks += map[i].br_blockcount; - i++; - } - } - mip->map_valid += mip->nmap; + goto out; } /* - * No valid mappings, so no more data blocks. + * Look for mapped directory blocks at or above the current offset. + * Truncate down to the nearest directory block to start the scanning + * operation. */ - if (!mip->map_valid) { - *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); + last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); + map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off)); + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map)) goto out; - } + if (map.br_startoff >= last_da) + goto out; + xfs_trim_extent(&map, map_off, last_da - map_off); - /* - * Read the directory block starting at the first mapping. - */ - mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); - error = xfs_dir3_data_read(NULL, dp, map->br_startoff, - map->br_blockcount >= geo->fsbcount ? - XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : - -1, &bp); - /* - * Should just skip over the data block instead of giving up. - */ + /* Read the directory block of that first mapping. */ + new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); + if (new_off > *cur_off) + *cur_off = new_off; + error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp); if (error) - goto out; /* XXX */ - - /* - * Adjust the current amount of read-ahead: we just read a block that - * was previously ra. - */ - if (mip->ra_current) - mip->ra_current -= geo->fsbcount; + goto out; /* - * Do we need more readahead? - * Each loop tries to process 1 full dir blk; last may be partial. + * Start readahead for the next bufsize's worth of dir data blocks. + * We may have already issued readahead for some of that range; + * ra_blk tracks the last block we tried to read(ahead). */ + ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); + if (*ra_blk >= last_da) + goto out; + else if (*ra_blk == 0) + *ra_blk = map.br_startoff; + next_ra = map.br_startoff + geo->fsbcount; + if (next_ra >= last_da) + goto out_no_ra; + if (map.br_blockcount < geo->fsbcount && + !xfs_iext_get_extent(ifp, ++idx, &map)) + goto out_no_ra; + if (map.br_startoff >= last_da) + goto out_no_ra; + xfs_trim_extent(&map, next_ra, last_da - next_ra); + + /* Start ra for each dir (not fs) block that has a mapping. */ blk_start_plug(&plug); - for (mip->ra_index = mip->ra_offset = i = 0; - mip->ra_want > mip->ra_current && i < mip->map_blocks; - i += geo->fsbcount) { - ASSERT(mip->ra_index < mip->map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > mip->ra_current && - (map[mip->ra_index].br_blockcount - mip->ra_offset) >= - geo->fsbcount) { - xfs_dir3_data_readahead(dp, - map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_FSB_TO_DADDR(dp->i_mount, - map[mip->ra_index].br_startblock + - mip->ra_offset)); - mip->ra_current = i; - } - - /* - * Read-ahead a non-contiguous directory block. This doesn't - * use our mapping, but this is a very rare case. - */ - else if (i > mip->ra_current) { - xfs_dir3_data_readahead(dp, - map[mip->ra_index].br_startoff + - mip->ra_offset, -1); - mip->ra_current = i; - } - - /* - * Advance offset through the mapping table, processing a full - * dir block even if it is fragmented into several extents. - * But stop if we have consumed all valid mappings, even if - * it's not yet a full directory block. - */ - for (j = 0; - j < geo->fsbcount && mip->ra_index < mip->map_valid; - j += length ) { - /* - * The rest of this extent but not more than a dir - * block. - */ - length = min_t(int, geo->fsbcount - j, - map[mip->ra_index].br_blockcount - - mip->ra_offset); - mip->ra_offset += length; - - /* - * Advance to the next mapping if this one is used up. - */ - if (mip->ra_offset == map[mip->ra_index].br_blockcount) { - mip->ra_offset = 0; - mip->ra_index++; + while (ra_want > 0) { + next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount); + while (ra_want > 0 && + next_ra < map.br_startoff + map.br_blockcount) { + if (next_ra >= last_da) { + *ra_blk = last_da; + break; } + if (next_ra > *ra_blk) { + xfs_dir3_data_readahead(dp, next_ra, -2); + *ra_blk = next_ra; + } + ra_want -= geo->fsbcount; + next_ra += geo->fsbcount; + } + if (!xfs_iext_get_extent(ifp, ++idx, &map)) { + *ra_blk = last_da; + break; } } blk_finish_plug(&plug); @@ -458,6 +344,9 @@ xfs_dir2_leaf_readbuf( out: *bpp = bp; return error; +out_no_ra: + *ra_blk = last_da; + goto out; } /* @@ -475,14 +364,14 @@ xfs_dir2_leaf_getdents( xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ - int error = 0; /* error return value */ - int length; /* temporary length value */ - int byteoff; /* offset in current block */ - xfs_dir2_off_t curoff; /* current overall offset */ - xfs_dir2_off_t newoff; /* new curoff after new blk */ char *ptr = NULL; /* pointer to current data */ - struct xfs_dir2_leaf_map_info *map_info; struct xfs_da_geometry *geo = args->geo; + xfs_dablk_t rablk = 0; /* current readahead block */ + xfs_dir2_off_t curoff; /* current overall offset */ + int length; /* temporary length value */ + int byteoff; /* offset in current block */ + int lock_mode; + int error = 0; /* error return value */ /* * If the offset is at or past the largest allowed value, @@ -492,73 +381,35 @@ xfs_dir2_leaf_getdents( return 0; /* - * Set up to bmap a number of blocks based on the caller's - * buffer size, the directory block size, and the filesystem - * block size. - */ - length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); - map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + - (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP | KM_NOFS); - map_info->map_size = length; - - /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ curoff = xfs_dir2_dataptr_to_byte(ctx->pos); /* - * Force this conversion through db so we truncate the offset - * down to get the start of the data block. - */ - map_info->map_off = xfs_dir2_db_to_da(geo, - xfs_dir2_byte_to_db(geo, curoff)); - - /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. */ while (curoff < XFS_DIR2_LEAF_OFFSET) { - __uint8_t filetype; + uint8_t filetype; /* * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { - int lock_mode; - bool trim_map = false; - if (bp) { - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); bp = NULL; - trim_map = true; } lock_mode = xfs_ilock_data_map_shared(dp); - error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, - &curoff, &bp, trim_map); + error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, + &rablk, &bp); xfs_iunlock(dp, lock_mode); - if (error || !map_info->map_valid) + if (error || !bp) break; - /* - * Having done a read, we need to set a new offset. - */ - newoff = xfs_dir2_db_off_to_byte(geo, - map_info->curdb, 0); - /* - * Start of the current block. - */ - if (curoff < newoff) - curoff = newoff; - /* - * Make sure we're in the right block. - */ - else if (curoff > newoff) - ASSERT(xfs_dir2_byte_to_db(geo, curoff) == - map_info->curdb); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* @@ -643,17 +494,22 @@ xfs_dir2_leaf_getdents( ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; - kmem_free(map_info); if (bp) - xfs_trans_brelse(NULL, bp); + xfs_trans_brelse(args->trans, bp); return error; } /* * Read a directory. + * + * If supplied, the transaction collects locked dir buffers to avoid + * nested buffer deadlocks. This function does not dirty the + * transaction. The caller should ensure that the inode is locked + * before calling this function. */ int xfs_readdir( + struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize) @@ -672,6 +528,7 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; + args.trans = tp; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 6a05d278da64..b2cde5426182 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -39,7 +39,7 @@ xfs_trim_extents( xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, - __uint64_t *blocks_trimmed) + uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; @@ -166,7 +166,7 @@ xfs_ioc_trim( struct fstrim_range range; xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; - __uint64_t blocks_trimmed = 0; + uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9d06cc30e875..f89f7b5241e6 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -276,7 +276,7 @@ xfs_qm_init_dquot_blk( void xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { - __uint64_t space; + uint64_t space; dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); @@ -472,18 +472,23 @@ xfs_qm_dqtobp( struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); - uint lock_mode; + uint lock_mode = 0; quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - lock_mode = xfs_ilock_data_map_shared(quotip); + ASSERT(!(flags & XFS_QMOPT_NOLOCK) || + xfs_isilocked(quotip, XFS_ILOCK_SHARED) || + xfs_isilocked(quotip, XFS_ILOCK_EXCL)); + if (!(flags & XFS_QMOPT_NOLOCK)) + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ - xfs_iunlock(quotip, lock_mode); + if (lock_mode) + xfs_iunlock(quotip, lock_mode); return -ESRCH; } @@ -493,7 +498,8 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, lock_mode); + if (lock_mode) + xfs_iunlock(quotip, lock_mode); if (error) return error; @@ -695,21 +701,18 @@ error0: */ static int xfs_dq_get_next_id( - xfs_mount_t *mp, + struct xfs_mount *mp, uint type, - xfs_dqid_t *id, - loff_t eof) + xfs_dqid_t *id) { - struct xfs_inode *quotip; + struct xfs_inode *quotip = xfs_quota_inode(mp, type); + xfs_dqid_t next_id = *id + 1; /* simple advance */ + uint lock_flags; + struct xfs_bmbt_irec got; + xfs_extnum_t idx; xfs_fsblock_t start; - loff_t offset; - uint lock; - xfs_dqid_t next_id; int error = 0; - /* Simple advance */ - next_id = *id + 1; - /* If we'd wrap past the max ID, stop */ if (next_id < *id) return -ENOENT; @@ -723,23 +726,25 @@ xfs_dq_get_next_id( /* Nope, next_id is now past the current chunk, so find the next one */ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; - quotip = xfs_quota_inode(mp, type); - lock = xfs_ilock_data_map_shared(quotip); - - offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start), - eof, SEEK_DATA); - if (offset < 0) - error = offset; + lock_flags = xfs_ilock_data_map_shared(quotip); + if (!(quotip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK); + if (error) + return error; + } - xfs_iunlock(quotip, lock); + if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &idx, &got)) { + /* contiguous chunk, bump startoff for the id calculation */ + if (got.br_startoff < start) + got.br_startoff = start; + *id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk; + } else { + error = -ENOENT; + } - /* -ENXIO is essentially "no more data" */ - if (error) - return (error == -ENXIO ? -ENOENT: error); + xfs_iunlock(quotip, lock_flags); - /* Convert next data offset back to a quota id */ - *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk; - return 0; + return error; } /* @@ -762,7 +767,6 @@ xfs_qm_dqget( struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; - loff_t eof = 0; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -790,21 +794,6 @@ xfs_qm_dqget( } #endif - /* Get the end of the quota file if we need it */ - if (flags & XFS_QMOPT_DQNEXT) { - struct xfs_inode *quotip; - xfs_fileoff_t last; - uint lock_mode; - - quotip = xfs_quota_inode(mp, type); - lock_mode = xfs_ilock_data_map_shared(quotip); - error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK); - xfs_iunlock(quotip, lock_mode); - if (error) - return error; - eof = XFS_FSB_TO_B(mp, last); - } - restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); @@ -823,7 +812,7 @@ restart: if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); - error = xfs_dq_get_next_id(mp, type, &id, eof); + error = xfs_dq_get_next_id(mp, type, &id); if (error) return error; goto restart; @@ -858,7 +847,7 @@ restart: /* If we are asked to find next active id, keep looking */ if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { - error = xfs_dq_get_next_id(mp, type, &id, eof); + error = xfs_dq_get_next_id(mp, type, &id); if (!error) goto restart; } @@ -917,7 +906,7 @@ restart: if (flags & XFS_QMOPT_DQNEXT) { if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { xfs_qm_dqput(dqp); - error = xfs_dq_get_next_id(mp, type, &id, eof); + error = xfs_dq_get_next_id(mp, type, &id); if (error) return error; goto restart; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed7ee4e8af73..2f4feb959bfb 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -22,103 +22,280 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_sysfs.h" #ifdef DEBUG -int xfs_etest[XFS_NUM_INJECT_ERROR]; -int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; -char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; -int xfs_error_test_active; +static unsigned int xfs_errortag_random_default[] = { + XFS_RANDOM_DEFAULT, + XFS_RANDOM_IFLUSH_1, + XFS_RANDOM_IFLUSH_2, + XFS_RANDOM_IFLUSH_3, + XFS_RANDOM_IFLUSH_4, + XFS_RANDOM_IFLUSH_5, + XFS_RANDOM_IFLUSH_6, + XFS_RANDOM_DA_READ_BUF, + XFS_RANDOM_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK, + XFS_RANDOM_ALLOC_READ_AGF, + XFS_RANDOM_IALLOC_READ_AGI, + XFS_RANDOM_ITOBP_INOTOBP, + XFS_RANDOM_IUNLINK, + XFS_RANDOM_IUNLINK_REMOVE, + XFS_RANDOM_DIR_INO_VALIDATE, + XFS_RANDOM_BULKSTAT_READ_CHUNK, + XFS_RANDOM_IODONE_IOERR, + XFS_RANDOM_STRATREAD_IOERR, + XFS_RANDOM_STRATCMPL_IOERR, + XFS_RANDOM_DIOWRITE_IOERR, + XFS_RANDOM_BMAPIFORMAT, + XFS_RANDOM_FREE_EXTENT, + XFS_RANDOM_RMAP_FINISH_ONE, + XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, + XFS_RANDOM_REFCOUNT_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE, + XFS_RANDOM_AG_RESV_CRITICAL, + XFS_RANDOM_DROP_WRITES, + XFS_RANDOM_LOG_BAD_CRC, +}; -int -xfs_error_test(int error_tag, int *fsidp, char *expression, - int line, char *file, unsigned long randfactor) +struct xfs_errortag_attr { + struct attribute attr; + unsigned int tag; +}; + +static inline struct xfs_errortag_attr * +to_attr(struct attribute *attr) { - int i; - int64_t fsid; + return container_of(attr, struct xfs_errortag_attr, attr); +} - if (prandom_u32() % randfactor) - return 0; +static inline struct xfs_mount * +to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); - memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + return container_of(kobj, struct xfs_mount, m_errortag_kobj); +} + +STATIC ssize_t +xfs_errortag_attr_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = to_mp(kobject); + struct xfs_errortag_attr *xfs_attr = to_attr(attr); + int ret; + unsigned int val; - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { - xfs_warn(NULL, - "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, xfs_etest_fsname[i]); - return 1; - } + if (strcmp(buf, "default") == 0) { + val = xfs_errortag_random_default[xfs_attr->tag]; + } else { + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; } - return 0; + ret = xfs_errortag_set(mp, xfs_attr->tag, val); + if (ret) + return ret; + return count; } +STATIC ssize_t +xfs_errortag_attr_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_mount *mp = to_mp(kobject); + struct xfs_errortag_attr *xfs_attr = to_attr(attr); + + return snprintf(buf, PAGE_SIZE, "%u\n", + xfs_errortag_get(mp, xfs_attr->tag)); +} + +static const struct sysfs_ops xfs_errortag_sysfs_ops = { + .show = xfs_errortag_attr_show, + .store = xfs_errortag_attr_store, +}; + +#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ + .tag = (_tag), \ +} + +#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr + +XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); +XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); +XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); +XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); +XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); +XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); +XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); +XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); +XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); +XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); +XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); +XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); +XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); +XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); +XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); +XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); +XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); +XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); +XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); +XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); +XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); +XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); +XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); +XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); +XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); +XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); +XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); +XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); + +static struct attribute *xfs_errortag_attrs[] = { + XFS_ERRORTAG_ATTR_LIST(noerror), + XFS_ERRORTAG_ATTR_LIST(iflush1), + XFS_ERRORTAG_ATTR_LIST(iflush2), + XFS_ERRORTAG_ATTR_LIST(iflush3), + XFS_ERRORTAG_ATTR_LIST(iflush4), + XFS_ERRORTAG_ATTR_LIST(iflush5), + XFS_ERRORTAG_ATTR_LIST(iflush6), + XFS_ERRORTAG_ATTR_LIST(dareadbuf), + XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), + XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), + XFS_ERRORTAG_ATTR_LIST(readagf), + XFS_ERRORTAG_ATTR_LIST(readagi), + XFS_ERRORTAG_ATTR_LIST(itobp), + XFS_ERRORTAG_ATTR_LIST(iunlink), + XFS_ERRORTAG_ATTR_LIST(iunlinkrm), + XFS_ERRORTAG_ATTR_LIST(dirinovalid), + XFS_ERRORTAG_ATTR_LIST(bulkstat), + XFS_ERRORTAG_ATTR_LIST(logiodone), + XFS_ERRORTAG_ATTR_LIST(stratread), + XFS_ERRORTAG_ATTR_LIST(stratcmpl), + XFS_ERRORTAG_ATTR_LIST(diowrite), + XFS_ERRORTAG_ATTR_LIST(bmapifmt), + XFS_ERRORTAG_ATTR_LIST(free_extent), + XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), + XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), + XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), + XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), + XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), + XFS_ERRORTAG_ATTR_LIST(drop_writes), + XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + NULL, +}; + +struct kobj_type xfs_errortag_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_errortag_sysfs_ops, + .default_attrs = xfs_errortag_attrs, +}; + int -xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) +xfs_errortag_init( + struct xfs_mount *mp) { - int i; - int len; - int64_t fsid; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_errortag) + return -ENOMEM; - if (error_tag >= XFS_ERRTAG_MAX) - return -EINVAL; + return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); +} - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); +void +xfs_errortag_del( + struct xfs_mount *mp) +{ + xfs_sysfs_del(&mp->m_errortag_kobj); + kmem_free(mp->m_errortag); +} - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { - xfs_warn(mp, "error tag #%d on", error_tag); - return 0; - } - } +bool +xfs_errortag_test( + struct xfs_mount *mp, + const char *expression, + const char *file, + int line, + unsigned int error_tag) +{ + unsigned int randfactor; - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest[i] == 0) { - xfs_warn(mp, "Turned on XFS error tag #%d", - error_tag); - xfs_etest[i] = error_tag; - xfs_etest_fsid[i] = fsid; - len = strlen(mp->m_fsname); - xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); - strcpy(xfs_etest_fsname[i], mp->m_fsname); - xfs_error_test_active++; - return 0; - } - } + /* + * To be able to use error injection anywhere, we need to ensure error + * injection mechanism is already initialized. + * + * Code paths like I/O completion can be called before the + * initialization is complete, but be able to inject errors in such + * places is still useful. + */ + if (!mp->m_errortag) + return false; - xfs_warn(mp, "error tag overflow, too many turned on"); + ASSERT(error_tag < XFS_ERRTAG_MAX); + randfactor = mp->m_errortag[error_tag]; + if (!randfactor || prandom_u32() % randfactor) + return false; - return 1; + xfs_warn_ratelimited(mp, +"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, mp->m_fsname); + return true; } int -xfs_errortag_clearall(xfs_mount_t *mp, int loud) +xfs_errortag_get( + struct xfs_mount *mp, + unsigned int error_tag) { - int64_t fsid; - int cleared = 0; - int i; - - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - - - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && - xfs_etest[i] != 0) { - cleared = 1; - xfs_warn(mp, "Clearing XFS error tag #%d", - xfs_etest[i]); - xfs_etest[i] = 0; - xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i]); - xfs_etest_fsname[i] = NULL; - xfs_error_test_active--; - } - } + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + + return mp->m_errortag[error_tag]; +} + +int +xfs_errortag_set( + struct xfs_mount *mp, + unsigned int error_tag, + unsigned int tag_value) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; - if (loud || cleared) - xfs_warn(mp, "Cleared all XFS error tags for filesystem"); + mp->m_errortag[error_tag] = tag_value; + return 0; +} +int +xfs_errortag_add( + struct xfs_mount *mp, + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + + return xfs_errortag_set(mp, error_tag, + xfs_errortag_random_default[error_tag]); +} + +int +xfs_errortag_clearall( + struct xfs_mount *mp) +{ + memset(mp->m_errortag, 0, sizeof(unsigned int) * XFS_ERRTAG_MAX); return 0; } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 05f8666733a0..7577be5f09bc 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -96,7 +96,17 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 -#define XFS_ERRTAG_MAX 28 +/* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are silenty dropped and handled as if they failed. + * All delalloc blocks in the range of the write (including pre-existing + * delalloc blocks!) are tossed as part of the write failure error + * handling sequence. + */ +#define XFS_ERRTAG_DROP_WRITES 28 +#define XFS_ERRTAG_LOG_BAD_CRC 29 +#define XFS_ERRTAG_MAX 30 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -129,23 +139,29 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 +#define XFS_RANDOM_DROP_WRITES 1 +#define XFS_RANDOM_LOG_BAD_CRC 1 #ifdef DEBUG -extern int xfs_error_test_active; -extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); - -#define XFS_NUM_INJECT_ERROR 10 -#define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || (xfs_error_test_active && \ - xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf)))) +extern int xfs_errortag_init(struct xfs_mount *mp); +extern void xfs_errortag_del(struct xfs_mount *mp); +extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, + const char *file, int line, unsigned int error_tag); +#define XFS_TEST_ERROR(expr, mp, tag) \ + ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) -extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); -extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); +extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); +extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, + unsigned int tag_value); +extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +extern int xfs_errortag_clearall(struct xfs_mount *mp); #else -#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) -#define xfs_errortag_add(tag, mp) (ENOSYS) -#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#define xfs_errortag_init(mp) (0) +#define xfs_errortag_del(mp) +#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define xfs_errortag_set(mp, tag, val) (ENOSYS) +#define xfs_errortag_add(mp, tag) (ENOSYS) +#define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35703a801372..c4893e226fd8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -140,7 +140,7 @@ xfs_file_fsync( trace_xfs_file_fsync(ip); - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; @@ -237,7 +237,11 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -541,7 +545,11 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -553,9 +561,15 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + if (unaligned_io) { + /* If we are going to wait for other DIO to finish, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) { + if (atomic_read(&inode->i_dio_count)) + return -EAGAIN; + } else { + inode_dio_wait(inode); + } + } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -585,7 +599,12 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -660,6 +679,7 @@ write_retry: xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); goto write_retry; } @@ -892,6 +912,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; + file->f_mode |= FMODE_AIO_NOWAIT; return 0; } @@ -950,395 +971,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - return xfs_readdir(ip, ctx, bufsize); -} - -/* - * This type is designed to indicate the type of offset we would like - * to search from page cache for xfs_seek_hole_data(). - */ -enum { - HOLE_OFF = 0, - DATA_OFF, -}; - -/* - * Lookup the desired type of offset from the given page. - * - * On success, return true and the offset argument will point to the - * start of the region that was found. Otherwise this function will - * return false and keep the offset argument unchanged. - */ -STATIC bool -xfs_lookup_buffer_offset( - struct page *page, - loff_t *offset, - unsigned int type) -{ - loff_t lastoff = page_offset(page); - bool found = false; - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - /* - * Unwritten extents that have data in the page - * cache covering them can be identified by the - * BH_Unwritten state flag. Pages with multiple - * buffers might have a mix of holes, data and - * unwritten extents - any buffer with valid - * data in it should have BH_Uptodate flag set - * on it. - */ - if (buffer_unwritten(bh) || - buffer_uptodate(bh)) { - if (type == DATA_OFF) - found = true; - } else { - if (type == HOLE_OFF) - found = true; - } - - if (found) { - *offset = lastoff; - break; - } - lastoff += bh->b_size; - } while ((bh = bh->b_this_page) != head); - - return found; -} - -/* - * This routine is called to find out and return a data or hole offset - * from the page cache for unwritten extents according to the desired - * type for xfs_seek_hole_data(). - * - * The argument offset is used to tell where we start to search from the - * page cache. Map is used to figure out the end points of the range to - * lookup pages. - * - * Return true if the desired type of offset was found, and the argument - * offset is filled with that address. Otherwise, return false and keep - * offset unchanged. - */ -STATIC bool -xfs_find_get_desired_pgoff( - struct inode *inode, - struct xfs_bmbt_irec *map, - unsigned int type, - loff_t *offset) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct pagevec pvec; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff = *offset; - loff_t lastoff = startoff; - bool found = false; - - pagevec_init(&pvec, 0); - - index = startoff >> PAGE_SHIFT; - endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; - do { - int want; - unsigned nr_pages; - unsigned int i; - - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } - break; - } - - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - loff_t b_offset; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to tmpfs - * file mapping. However, page->index will not change - * because we have a reference on the page. - * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. - */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } - goto out; - } - - lock_page(page); - /* - * Page truncated or invalidated(page->mapping == NULL). - * We can freely skip it and proceed to check the next - * page. - */ - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - found = xfs_lookup_buffer_offset(page, &b_offset, type); - if (found) { - /* - * The found offset may be less than the start - * point to search if this is the first time to - * come here. - */ - *offset = max_t(loff_t, startoff, b_offset); - unlock_page(page); - goto out; - } - - /* - * We either searching data but nothing was found, or - * searching hole but found a data buffer. In either - * case, probably the next page contains the desired - * things, update the last offset to it so. - */ - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - /* - * The number of returned pages less than our desired, search - * done. In this case, nothing was found for searching data, - * but we found a hole behind the last offset. - */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } - break; - } - - index = pvec.pages[i - 1]->index + 1; - pagevec_release(&pvec); - } while (index <= end); - -out: - pagevec_release(&pvec); - return found; -} - -/* - * caller must lock inode with xfs_ilock_data_map_shared, - * can we craft an appropriate ASSERT? - * - * end is because the VFS-level lseek interface is defined such that any - * offset past i_size shall return -ENXIO, but we use this for quota code - * which does not maintain i_size, and we want to SEEK_DATA past i_size. - */ -loff_t -__xfs_seek_hole_data( - struct inode *inode, - loff_t start, - loff_t end, - int whence) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - loff_t uninitialized_var(offset); - xfs_fileoff_t fsbno; - xfs_filblks_t lastbno; - int error; - - if (start >= end) { - error = -ENXIO; - goto out_error; - } - - /* - * Try to read extents from the first block indicated - * by fsbno to the end block of the file. - */ - fsbno = XFS_B_TO_FSBT(mp, start); - lastbno = XFS_B_TO_FSB(mp, end); - - for (;;) { - struct xfs_bmbt_irec map[2]; - int nmap = 2; - unsigned int i; - - error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_error; - - /* No extents at given offset, must be beyond EOF */ - if (nmap == 0) { - error = -ENXIO; - goto out_error; - } - - for (i = 0; i < nmap; i++) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[i].br_startoff)); - - /* Landed in the hole we wanted? */ - if (whence == SEEK_HOLE && - map[i].br_startblock == HOLESTARTBLOCK) - goto out; - - /* Landed in the data extent we wanted? */ - if (whence == SEEK_DATA && - (map[i].br_startblock == DELAYSTARTBLOCK || - (map[i].br_state == XFS_EXT_NORM && - !isnullstartblock(map[i].br_startblock)))) - goto out; - - /* - * Landed in an unwritten extent, try to search - * for hole or data from page cache. - */ - if (map[i].br_state == XFS_EXT_UNWRITTEN) { - if (xfs_find_get_desired_pgoff(inode, &map[i], - whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF, - &offset)) - goto out; - } - } - - /* - * We only received one extent out of the two requested. This - * means we've hit EOF and didn't find what we are looking for. - */ - if (nmap == 1) { - /* - * If we were looking for a hole, set offset to - * the end of the file (i.e., there is an implicit - * hole at the end of any file). - */ - if (whence == SEEK_HOLE) { - offset = end; - break; - } - /* - * If we were looking for data, it's nowhere to be found - */ - ASSERT(whence == SEEK_DATA); - error = -ENXIO; - goto out_error; - } - - ASSERT(i > 1); - - /* - * Nothing was found, proceed to the next round of search - * if the next reading offset is not at or beyond EOF. - */ - fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; - start = XFS_FSB_TO_B(mp, fsbno); - if (start >= end) { - if (whence == SEEK_HOLE) { - offset = end; - break; - } - ASSERT(whence == SEEK_DATA); - error = -ENXIO; - goto out_error; - } - } - -out: - /* - * If at this point we have found the hole we wanted, the returned - * offset may be bigger than the file size as it may be aligned to - * page boundary for unwritten extents. We need to deal with this - * situation in particular. - */ - if (whence == SEEK_HOLE) - offset = min_t(loff_t, offset, end); - - return offset; - -out_error: - return error; -} - -STATIC loff_t -xfs_seek_hole_data( - struct file *file, - loff_t start, - int whence) -{ - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - uint lock; - loff_t offset, end; - int error = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - lock = xfs_ilock_data_map_shared(ip); - - end = i_size_read(inode); - offset = __xfs_seek_hole_data(inode, start, end, whence); - if (offset < 0) { - error = offset; - goto out_unlock; - } - - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); - -out_unlock: - xfs_iunlock(ip, lock); - - if (error) - return error; - return offset; + return xfs_readdir(NULL, ip, ctx, bufsize); } STATIC loff_t @@ -1347,17 +980,25 @@ xfs_file_llseek( loff_t offset, int whence) { + struct inode *inode = file->f_mapping->host; + + if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount)) + return -EIO; + switch (whence) { - case SEEK_END: - case SEEK_CUR: - case SEEK_SET: + default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: + offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + break; case SEEK_DATA: - return xfs_seek_hole_data(file, offset, whence); - default: - return -EINVAL; + offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + break; } + + if (offset < 0) + return offset; + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } /* diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3683819887a5..814ed729881d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -828,6 +828,7 @@ xfs_getfsmap( struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; struct xfs_getfsmap_info info = { NULL }; + bool use_rmap; int i; int error = 0; @@ -837,12 +838,14 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + use_rmap = capable(CAP_SYS_ADMIN) && + xfs_sb_version_hasrmapbt(&mp->m_sb); head->fmh_entries = 0; /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 6ccaae9eb0ee..8f22fc579dbb 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -602,7 +602,7 @@ xfs_growfs_data_private( if (nagimax) mp->m_maxagi = nagimax; if (mp->m_sb.sb_imax_pct) { - __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else @@ -793,17 +793,17 @@ xfs_fs_counts( int xfs_reserve_blocks( xfs_mount_t *mp, - __uint64_t *inval, + uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta; - __int64_t fdblks_delta = 0; - __uint64_t request; - __int64_t free; + int64_t lcounter, delta; + int64_t fdblks_delta = 0; + uint64_t request; + int64_t free; int error = 0; /* If inval is null, report current values and return */ - if (inval == (__uint64_t *)NULL) { + if (inval == (uint64_t *)NULL) { if (!outval) return -EINVAL; outval->resblks = mp->m_resblks; @@ -904,7 +904,7 @@ out: int xfs_fs_goingdown( xfs_mount_t *mp, - __uint32_t inflags) + uint32_t inflags) { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f34915898fea..2954c13a3acd 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -22,9 +22,9 @@ extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); -extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, +extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); -extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 687a4b01fc53..3e1cc3001bcb 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -47,4 +47,9 @@ xfs_param_t xfs_params = { struct xfs_globals xfs_globals = { .log_recovery_delay = 0, /* no delay by default */ +#ifdef XFS_ASSERT_FATAL + .bug_on_assert = true, /* assert failures BUG() */ +#else + .bug_on_assert = false, /* assert failures WARN() */ +#endif }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f61c84f8e31a..0a9e6985a0d0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -66,7 +66,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); @@ -190,7 +189,7 @@ xfs_perag_set_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (pag->pag_ici_reclaimable++) return; @@ -212,7 +211,7 @@ xfs_perag_clear_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (--pag->pag_ici_reclaimable) return; @@ -270,12 +269,12 @@ xfs_inew_wait( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (!xfs_iflags_test(ip, XFS_INEW)) break; schedule(); } while (true); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } /* @@ -369,6 +368,11 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { trace_xfs_iget_reclaim(ip); + if (flags & XFS_IGET_INCORE) { + error = -EAGAIN; + goto out_error; + } + /* * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode * from stomping over us while we recycle the inode. We can't @@ -433,7 +437,8 @@ xfs_iget_cache_hit( if (lock_flags != 0) xfs_ilock(ip, lock_flags); - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + if (!(flags & XFS_IGET_INCORE)) + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -604,6 +609,10 @@ again: goto out_error_or_again; } else { rcu_read_unlock(); + if (flags & XFS_IGET_INCORE) { + error = -ENOENT; + goto out_error_or_again; + } XFS_STATS_INC(mp, xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, @@ -624,7 +633,7 @@ again: return 0; out_error_or_again: - if (error == -EAGAIN) { + if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { delay(1); goto again; } @@ -633,6 +642,44 @@ out_error_or_again: } /* + * "Is this a cached inode that's also allocated?" + * + * Look up an inode by number in the given file system. If the inode is + * in cache and isn't in purgatory, return 1 if the inode is allocated + * and 0 if it is not. For all other cases (not in cache, being torn + * down, etc.), return a negative error code. + * + * The caller has to prevent inode allocation and freeing activity, + * presumably by locking the AGI buffer. This is to ensure that an + * inode cannot transition from allocated to freed until the caller is + * ready to allow that. If the inode is in an intermediate state (new, + * reclaimable, or being reclaimed), -EAGAIN will be returned; if the + * inode is not in the cache, -ENOENT will be returned. The caller must + * deal with these scenarios appropriately. + * + * This is a specialized use case for the online scrubber; if you're + * reading this, you probably want xfs_iget. + */ +int +xfs_icache_inode_is_allocated( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + bool *inuse) +{ + struct xfs_inode *ip; + int error; + + error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); + if (error) + return error; + + *inuse = !!(VFS_I(ip)->i_mode); + IRELE(ip); + return 0; +} + +/* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between * lookup reduction and stack usage. This is in the reclaim path, so we can't diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 9183f77958ef..bff4d85e5498 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -47,6 +47,7 @@ struct xfs_eofblocks { #define XFS_IGET_CREATE 0x1 #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ /* * flags for AG inode iterator @@ -126,4 +127,7 @@ xfs_fs_eofblocks_from_user( return 0; } +int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_ino_t ino, bool *inuse); + #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ec9826c56500..ceef77c0416a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -622,17 +622,17 @@ __xfs_iflock( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_isiflocked(ip)) io_schedule(); } while (!xfs_iflock_nowait(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } STATIC uint _xfs_dic2xflags( - __uint16_t di_flags, + uint16_t di_flags, uint64_t di_flags2, bool has_attr) { @@ -855,8 +855,8 @@ xfs_ialloc( inode->i_version = 1; ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; - ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; + ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; } @@ -2486,11 +2486,11 @@ __xfs_iunpin_wait( xfs_iunpin(ip); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_ipincount(ip)) io_schedule(); } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } void @@ -3489,7 +3489,7 @@ xfs_iflush_int( dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -3499,7 +3499,7 @@ xfs_iflush_int( if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr 0x%p", __func__, ip->i_ino, ip); @@ -3510,7 +3510,7 @@ xfs_iflush_int( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), - mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr 0x%p", __func__, ip->i_ino, ip); @@ -3518,8 +3518,7 @@ xfs_iflush_int( } } if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > - ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, - XFS_RANDOM_IFLUSH_5)) { + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr 0x%p", @@ -3529,7 +3528,7 @@ xfs_iflush_int( goto corrupt_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", __func__, ip->i_ino, ip->i_d.di_forkoff, ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 10e89fcb49d7..0ee453de239a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -192,8 +192,8 @@ static inline void xfs_set_projid(struct xfs_inode *ip, prid_t projid) { - ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); - ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); + ip->i_d.di_projid_hi = (uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff); } static inline prid_t @@ -445,9 +445,6 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, bool *did_zero); -loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, - loff_t eof, int whence); - /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 08cb7d1a4a3a..013cc78d7daf 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -834,9 +834,7 @@ xfs_inode_item_format_convert( in_f->ilf_dsize = in_f32->ilf_dsize; in_f->ilf_ino = in_f32->ilf_ino; /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f32->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); in_f->ilf_blkno = in_f32->ilf_blkno; in_f->ilf_len = in_f32->ilf_len; in_f->ilf_boffset = in_f32->ilf_boffset; @@ -851,9 +849,7 @@ xfs_inode_item_format_convert( in_f->ilf_dsize = in_f64->ilf_dsize; in_f->ilf_ino = in_f64->ilf_ino; /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f64->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid); in_f->ilf_blkno = in_f64->ilf_blkno; in_f->ilf_len = in_f64->ilf_len; in_f->ilf_boffset = in_f64->ilf_boffset; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6190697603c9..9c0c7a920304 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -120,8 +120,7 @@ xfs_find_handle( handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = inode->i_generation; handle.ha_fid.fid_ino = ip->i_ino; - - hsize = XFS_HSIZE(handle); + hsize = sizeof(xfs_handle_t); } error = -EFAULT; @@ -444,8 +443,8 @@ xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, unsigned char __user *ubuf, - __uint32_t *len, - __uint32_t flags) + uint32_t *len, + uint32_t flags) { unsigned char *kbuf; int error = -EFAULT; @@ -473,8 +472,8 @@ xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, const unsigned char __user *ubuf, - __uint32_t len, - __uint32_t flags) + uint32_t len, + uint32_t flags) { unsigned char *kbuf; int error; @@ -499,7 +498,7 @@ int xfs_attrmulti_attr_remove( struct inode *inode, unsigned char *name, - __uint32_t flags) + uint32_t flags) { int error; @@ -877,7 +876,7 @@ xfs_merge_ioc_xflags( STATIC unsigned int xfs_di2lxflags( - __uint16_t di_flags) + uint16_t di_flags) { unsigned int flags = 0; @@ -1288,7 +1287,7 @@ xfs_ioctl_setattr_check_projid( struct fsxattr *fa) { /* Disallow 32bit project ids if projid32bit feature is not enabled. */ - if (fa->fsx_projid > (__uint16_t)-1 && + if (fa->fsx_projid > (uint16_t)-1 && !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) return -EINVAL; @@ -1932,7 +1931,7 @@ xfs_file_ioctl( case XFS_IOC_SET_RESBLKS: { xfs_fsop_resblks_t inout; - __uint64_t in; + uint64_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2018,12 +2017,12 @@ xfs_file_ioctl( } case XFS_IOC_GOINGDOWN: { - __uint32_t in; + uint32_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (get_user(in, (__uint32_t __user *)arg)) + if (get_user(in, (uint32_t __user *)arg)) return -EFAULT; return xfs_fs_goingdown(mp, in); @@ -2038,14 +2037,14 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; - return xfs_errortag_add(in.errtag, mp); + return xfs_errortag_add(mp, in.errtag); } case XFS_IOC_ERROR_CLEARALL: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return xfs_errortag_clearall(mp, 1); + return xfs_errortag_clearall(mp); case XFS_IOC_FREE_EOFBLOCKS: { struct xfs_fs_eofblocks eofb; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 8b52881bfd90..e86c3ea137d2 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -48,22 +48,22 @@ xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, unsigned char __user *ubuf, - __uint32_t *len, - __uint32_t flags); + uint32_t *len, + uint32_t flags); extern int xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, const unsigned char __user *ubuf, - __uint32_t len, - __uint32_t flags); + uint32_t len, + uint32_t flags); extern int xfs_attrmulti_attr_remove( struct inode *inode, unsigned char *name, - __uint32_t flags); + uint32_t flags); extern struct dentry * xfs_handle_to_dentry( diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index b1bb45444df8..5492bcf6f442 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -112,9 +112,9 @@ typedef struct compat_xfs_fsop_handlereq { /* The bstat field in the swapext struct needs translation */ typedef struct compat_xfs_swapext { - __int64_t sx_version; /* version */ - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ + int64_t sx_version; /* version */ + int64_t sx_fdtarget; /* fd of target file */ + int64_t sx_fdtmp; /* fd of tmp file */ xfs_off_t sx_offset; /* offset into file */ xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a63f61c256bd..813394c62849 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -543,7 +543,7 @@ xfs_file_iomap_begin_delay( if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_unlock; @@ -995,6 +995,11 @@ xfs_file_iomap_begin( lockmode = xfs_ilock_data_map_shared(ip); } + if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = -EAGAIN; + goto out_unlock; + } + ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) length = mp->m_super->s_maxbytes - offset; @@ -1016,6 +1021,15 @@ xfs_file_iomap_begin( if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { if (flags & IOMAP_DIRECT) { + /* + * A reflinked inode will result in CoW alloc. + * FIXME: It could still overwrite on unshared extents + * and not need allocation. + */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1033,6 +1047,14 @@ xfs_file_iomap_begin( if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* + * If nowait is set bail since we are going to make + * allocations. + */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } + /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric * with the work writeback does. This is a completely arbitrary @@ -1068,7 +1090,7 @@ xfs_file_iomap_begin( /* optionally associate a dax device with the iomap bdev */ bdev = iomap->bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -1097,7 +1119,7 @@ xfs_file_iomap_end_delalloc( * Behave as if the write failed if drop writes is enabled. Set the NEW * flag to force delalloc cleanup. */ - if (xfs_mp_drop_writes(mp)) { + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { iomap->flags |= IOMAP_F_NEW; written = 0; } @@ -1149,7 +1171,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ebfc13350f9a..469c9fa4c178 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -190,12 +190,12 @@ xfs_generic_create( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } @@ -460,7 +460,7 @@ xfs_vn_get_link( if (!dentry) return ERR_PTR(-ECHILD); - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + link = kmalloc(XFS_SYMLINK_MAXLEN+1, GFP_KERNEL); if (!link) goto out_err; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 26d67ce3c18d..c393a2f6d8c3 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -31,7 +31,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" -STATIC int +int xfs_internal_inum( xfs_mount_t *mp, xfs_ino_t ino) diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 6ea8b3912fa4..17e86e0541af 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -96,4 +96,6 @@ xfs_inumbers( void __user *buffer, /* buffer with inode info */ inumbers_fmt_pf formatter); +int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); + #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 044fb0e15390..9301c5a6060b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -19,18 +19,11 @@ #define __XFS_LINUX__ #include <linux/types.h> +#include <linux/uuid.h> /* * Kernel specific type declarations for XFS */ -typedef signed char __int8_t; -typedef unsigned char __uint8_t; -typedef signed short int __int16_t; -typedef unsigned short int __uint16_t; -typedef signed int __int32_t; -typedef unsigned int __uint32_t; -typedef signed long long int __int64_t; -typedef unsigned long long int __uint64_t; typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ @@ -42,7 +35,6 @@ typedef __u32 xfs_nlink_t; #include "kmem.h" #include "mrlock.h" -#include "uuid.h" #include <linux/semaphore.h> #include <linux/mm.h> @@ -151,7 +143,6 @@ typedef __u32 xfs_nlink_t; #define __return_address __builtin_return_address(0) #define XFS_PROJID_DEFAULT 0 -#define MAXPATHLEN 1024 #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) @@ -186,22 +177,22 @@ extern struct xstats xfsstats; * are converting to the init_user_ns. The uid is later mapped to a particular * user namespace value when crossing the kernel/user boundary. */ -static inline __uint32_t xfs_kuid_to_uid(kuid_t uid) +static inline uint32_t xfs_kuid_to_uid(kuid_t uid) { return from_kuid(&init_user_ns, uid); } -static inline kuid_t xfs_uid_to_kuid(__uint32_t uid) +static inline kuid_t xfs_uid_to_kuid(uint32_t uid) { return make_kuid(&init_user_ns, uid); } -static inline __uint32_t xfs_kgid_to_gid(kgid_t gid) +static inline uint32_t xfs_kgid_to_gid(kgid_t gid) { return from_kgid(&init_user_ns, gid); } -static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) +static inline kgid_t xfs_gid_to_kgid(uint32_t gid) { return make_kgid(&init_user_ns, gid); } @@ -231,14 +222,14 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) -static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; do_div(x, y); return x * y; } -static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +static inline uint64_t howmany_64(uint64_t x, uint32_t y) { x += y - 1; do_div(x, y); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3731f13f63e9..0053bcf2b10a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -434,7 +434,7 @@ xfs_log_reserve( int unit_bytes, int cnt, struct xlog_ticket **ticp, - __uint8_t client, + uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -825,9 +825,9 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!error) { /* the data section must be 32 bit size aligned */ struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ + uint16_t magic; + uint16_t pad1; + uint32_t pad2; /* may as well make it 64 bits */ } magic = { .magic = XLOG_UNMOUNT_TYPE, }; @@ -1189,8 +1189,7 @@ xlog_iodone(xfs_buf_t *bp) * IOABORT state. The IOABORT state is only set in DEBUG mode to inject * CRC errors into log recovery. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, - XFS_RANDOM_IODONE_IOERR) || + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) || iclog->ic_state & XLOG_STATE_IOABORT) { if (iclog->ic_state & XLOG_STATE_IOABORT) iclog->ic_state &= ~XLOG_STATE_IOABORT; @@ -1665,7 +1664,7 @@ xlog_cksum( char *dp, int size) { - __uint32_t crc; + uint32_t crc; /* first generate the crc for the record header ... */ crc = xfs_start_cksum_update((char *)rhead, @@ -1828,7 +1827,7 @@ xlog_sync( */ dptr = (char *)&iclog->ic_header + count; for (i = 0; i < split; i += BBSIZE) { - __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); if (++cycle == XLOG_HEADER_MAGIC_NUM) cycle++; *(__be32 *)dptr = cpu_to_be32(cycle); @@ -1842,7 +1841,6 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size); -#ifdef DEBUG /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -1850,15 +1848,13 @@ xlog_sync( * write on I/O completion and shutdown the fs. The subsequent mount * detects the bad CRC and attempts to recover. */ - if (log->l_badcrc_factor && - (prandom_u32() % log->l_badcrc_factor == 0)) { + if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_state |= XLOG_STATE_IOABORT; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", be64_to_cpu(iclog->ic_header.h_lsn)); } -#endif bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; @@ -2024,7 +2020,7 @@ xlog_print_tic_res( }; #undef REG_TYPE_STR - xfs_warn(mp, "xlog_write: reservation summary:"); + xfs_warn(mp, "ticket reservation summary:"); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); xfs_warn(mp, " current res = %d bytes", @@ -2045,10 +2041,55 @@ xlog_print_tic_res( "bad-rtype" : res_type_str[r_type]), ticket->t_res_arr[i].r_len); } +} + +/* + * Print a summary of the transaction. + */ +void +xlog_print_trans( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item_desc *lidp; + + /* dump core transaction and ticket info */ + xfs_warn(mp, "transaction summary:"); + xfs_warn(mp, " flags = 0x%x", tp->t_flags); + + xlog_print_tic_res(mp, tp->t_ticket); - xfs_alert_tag(mp, XFS_PTAG_LOGRES, - "xlog_write: reservation ran out. Need to up reservation"); - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + /* dump each log item */ + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv = lip->li_lv; + struct xfs_log_iovec *vec; + int i; + + xfs_warn(mp, "log item: "); + xfs_warn(mp, " type = 0x%x", lip->li_type); + xfs_warn(mp, " flags = 0x%x", lip->li_flags); + if (!lv) + continue; + xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); + xfs_warn(mp, " size = %d", lv->lv_size); + xfs_warn(mp, " bytes = %d", lv->lv_bytes); + xfs_warn(mp, " buf len = %d", lv->lv_buf_len); + + /* dump each iovec for the log item */ + vec = lv->lv_iovecp; + for (i = 0; i < lv->lv_niovecs; i++) { + int dumplen = min(vec->i_len, 32); + + xfs_warn(mp, " iovec[%d]", i); + xfs_warn(mp, " type = 0x%x", vec->i_type); + xfs_warn(mp, " len = %d", vec->i_len); + xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i); + xfs_hex_dump(vec->i_addr, dumplen); + + vec++; + } + } } /* @@ -2321,8 +2362,12 @@ xlog_write( if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) ticket->t_curr_res -= sizeof(xlog_op_header_t); - if (ticket->t_curr_res < 0) + if (ticket->t_curr_res < 0) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + } index = 0; lv = log_vector; @@ -2363,8 +2408,8 @@ xlog_write( } reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(__int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + ASSERT(reg->i_len % sizeof(int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); start_rec_copy = xlog_write_start_rec(ptr, ticket); if (start_rec_copy) { @@ -3143,7 +3188,7 @@ xlog_state_switch_iclogs( /* Round up to next log-sunit */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { - __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); log->l_curr_block = roundup(log->l_curr_block, sunit_bb); } @@ -3771,7 +3816,7 @@ xlog_verify_iclog( xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; ptrdiff_t field_offset; - __uint8_t clientid; + uint8_t clientid; int len, i, j, k, op_len; int idx; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index cc5a9f1574e7..bf212772595c 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -159,7 +159,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int length, int count, struct xlog_ticket **ticket, - __uint8_t clientid, + uint8_t clientid, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 82f1cbcc4de1..fbe72b134bef 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -410,6 +410,7 @@ xlog_cil_insert_items( int len = 0; int diff_iovecs = 0; int iclog_space; + int iovhdr_res = 0, split_res = 0, ctx_res = 0; ASSERT(tp); @@ -419,30 +420,11 @@ xlog_cil_insert_items( */ xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); - /* - * Now (re-)position everything modified at the tail of the CIL. - * We do this here so we only need to take the CIL lock once during - * the transaction commit. - */ spin_lock(&cil->xc_cil_lock); - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; - - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* - * Only move the item if it isn't already at the tail. This is - * to prevent a transient list_empty() state when reinserting - * an item that is already the only item in the CIL. - */ - if (!list_is_last(&lip->li_cil, &cil->xc_cil)) - list_move_tail(&lip->li_cil, &cil->xc_cil); - } /* account for space used by new iovec headers */ - len += diff_iovecs * sizeof(xlog_op_header_t); + iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t); + len += iovhdr_res; ctx->nvecs += diff_iovecs; /* attach the transaction to the CIL if it has any busy extents */ @@ -457,28 +439,66 @@ xlog_cil_insert_items( * during the transaction commit. */ if (ctx->ticket->t_curr_res == 0) { - ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; - tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; + ctx_res = ctx->ticket->t_unit_res; + ctx->ticket->t_curr_res = ctx_res; + tp->t_ticket->t_curr_res -= ctx_res; } /* do we need space for more log record headers? */ iclog_space = log->l_iclog_size - log->l_iclog_hsize; if (len > 0 && (ctx->space_used / iclog_space != (ctx->space_used + len) / iclog_space)) { - int hdrs; - - hdrs = (len + iclog_space - 1) / iclog_space; + split_res = (len + iclog_space - 1) / iclog_space; /* need to take into account split region headers, too */ - hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); - ctx->ticket->t_unit_res += hdrs; - ctx->ticket->t_curr_res += hdrs; - tp->t_ticket->t_curr_res -= hdrs; + split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += split_res; + ctx->ticket->t_curr_res += split_res; + tp->t_ticket->t_curr_res -= split_res; ASSERT(tp->t_ticket->t_curr_res >= len); } tp->t_ticket->t_curr_res -= len; ctx->space_used += len; + /* + * If we've overrun the reservation, dump the tx details before we move + * the log items. Shutdown is imminent... + */ + if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { + xfs_warn(log->l_mp, "Transaction log reservation overrun:"); + xfs_warn(log->l_mp, + " log items: %d bytes (iov hdrs: %d bytes)", + len, iovhdr_res); + xfs_warn(log->l_mp, " split region headers: %d bytes", + split_res); + xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); + xlog_print_trans(tp); + } + + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. + */ + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* + * Only move the item if it isn't already at the tail. This is + * to prevent a transient list_empty() state when reinserting + * an item that is already the only item in the CIL. + */ + if (!list_is_last(&lip->li_cil, &cil->xc_cil)) + list_move_tail(&lip->li_cil, &cil->xc_cil); + } + spin_unlock(&cil->xc_cil_lock); + + if (tp->t_ticket->t_curr_res < 0) + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } static void @@ -973,6 +993,7 @@ xfs_log_commit_cil( { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + xfs_lsn_t xc_commit_lsn; /* * Do all necessary memory allocation before we lock the CIL. @@ -986,13 +1007,9 @@ xfs_log_commit_cil( xlog_cil_insert_items(log, tp); - /* check we didn't blow the reservation */ - if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(mp, tp->t_ticket); - - tp->t_commit_lsn = cil->xc_ctx->sequence; + xc_commit_lsn = cil->xc_ctx->sequence; if (commit_lsn) - *commit_lsn = tp->t_commit_lsn; + *commit_lsn = xc_commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); @@ -1008,7 +1025,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, tp->t_commit_lsn, false); + xfs_trans_free_items(tp, xc_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c2604a5366f2..51bf7b827387 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -419,7 +419,7 @@ struct xlog { }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) @@ -456,6 +456,7 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) } void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +void xlog_print_trans(struct xfs_trans *); int xlog_write( struct xlog *log, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4a98762ec8b4..9549188f5a36 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -352,13 +352,13 @@ xlog_header_check_mount( { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); - if (uuid_is_nil(&head->h_fs_uuid)) { + if (uuid_is_null(&head->h_fs_uuid)) { /* * IRIX doesn't write the h_fs_uuid or h_fmt fields. If - * h_fs_uuid is nil, we assume this log was last mounted + * h_fs_uuid is null, we assume this log was last mounted * by IRIX and continue. */ - xfs_warn(mp, "nil uuid in log - IRIX style log"); + xfs_warn(mp, "null uuid in log - IRIX style log"); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); @@ -2230,9 +2230,9 @@ xlog_recover_get_buf_lsn( struct xfs_mount *mp, struct xfs_buf *bp) { - __uint32_t magic32; - __uint16_t magic16; - __uint16_t magicda; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; void *blk = bp->b_addr; uuid_t *uuid; xfs_lsn_t lsn = -1; @@ -2381,9 +2381,9 @@ xlog_recover_validate_buf_type( xfs_lsn_t current_lsn) { struct xfs_da_blkinfo *info = bp->b_addr; - __uint32_t magic32; - __uint16_t magic16; - __uint16_t magicda; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; char *warnmsg = NULL; /* @@ -2852,7 +2852,7 @@ xlog_recover_buffer_pass2( if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)log->l_mp->m_inode_cluster_size))) { + (uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { @@ -3423,7 +3423,7 @@ xlog_recover_efd_pass2( xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; xfs_log_item_t *lip; - __uint64_t efi_id; + uint64_t efi_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3519,7 +3519,7 @@ xlog_recover_rud_pass2( struct xfs_rud_log_format *rud_formatp; struct xfs_rui_log_item *ruip = NULL; struct xfs_log_item *lip; - __uint64_t rui_id; + uint64_t rui_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3635,7 +3635,7 @@ xlog_recover_cud_pass2( struct xfs_cud_log_format *cud_formatp; struct xfs_cui_log_item *cuip = NULL; struct xfs_log_item *lip; - __uint64_t cui_id; + uint64_t cui_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3754,7 +3754,7 @@ xlog_recover_bud_pass2( struct xfs_bud_log_format *bud_formatp; struct xfs_bui_log_item *buip = NULL; struct xfs_log_item *lip; - __uint64_t bui_id; + uint64_t bui_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3796,7 +3796,7 @@ xlog_recover_bud_pass2( * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that - * match the range to be intialised, stamped with inode templates and written + * match the range to be initialised, stamped with inode templates and written * by delayed write so that subsequent modifications will hit the cached buffer * and only need writing out at the end of recovery. */ @@ -4152,7 +4152,7 @@ xlog_recover_commit_trans( #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 - hlist_del(&trans->r_list); + hlist_del_init(&trans->r_list); error = xlog_recover_reorder_trans(log, trans, pass); if (error) @@ -4354,6 +4354,8 @@ xlog_recover_free_trans( xlog_recover_item_t *item, *n; int i; + hlist_del_init(&trans->r_list); + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { /* Free the regions in the item. */ list_del(&item->ri_list); @@ -5224,12 +5226,16 @@ xlog_do_recovery_pass( int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; + int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); rhead_blk = 0; + for (i = 0; i < XLOG_RHASH_SIZE; i++) + INIT_HLIST_HEAD(&rhash[i]); + /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. @@ -5466,6 +5472,19 @@ xlog_do_recovery_pass( if (error && first_bad) *first_bad = rhead_blk; + /* + * Transactions are freed at commit time but transactions without commit + * records on disk are never committed. Free any that may be left in the + * hash table. + */ + for (i = 0; i < XLOG_RHASH_SIZE; i++) { + struct hlist_node *tmp; + struct xlog_recover *trans; + + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) + xlog_recover_free_trans(trans); + } + return error ? error : error2; } @@ -5772,9 +5791,9 @@ xlog_recover_check_summary( xfs_buf_t *agfbp; xfs_buf_t *agibp; xfs_agnumber_t agno; - __uint64_t freeblks; - __uint64_t itotal; - __uint64_t ifree; + uint64_t freeblks; + uint64_t itotal; + uint64_t ifree; int error; mp = log->l_mp; diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 11792d888e4e..e68bd1050eab 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -110,7 +110,10 @@ assfail(char *expr, char *file, int line) { xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", expr, file, line); - BUG(); + if (xfs_globals.bug_on_assert) + BUG(); + else + WARN_ON(1); } void diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2eaf81859166..40d4e8b4e193 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -74,20 +74,19 @@ xfs_uuid_mount( int hole, i; /* Publish UUID in struct super_block */ - BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t)); - memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t)); + uuid_copy(&mp->m_super->s_uuid, uuid); if (mp->m_flags & XFS_MOUNT_NOUUID) return 0; - if (uuid_is_nil(uuid)) { - xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + if (uuid_is_null(uuid)) { + xfs_warn(mp, "Filesystem has null UUID - can't mount"); return -EINVAL; } mutex_lock(&xfs_uuid_table_mutex); for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { - if (uuid_is_nil(&xfs_uuid_table[i])) { + if (uuid_is_null(&xfs_uuid_table[i])) { hole = i; continue; } @@ -124,7 +123,7 @@ xfs_uuid_unmount( mutex_lock(&xfs_uuid_table_mutex); for (i = 0; i < xfs_uuid_table_size; i++) { - if (uuid_is_nil(&xfs_uuid_table[i])) + if (uuid_is_null(&xfs_uuid_table[i])) continue; if (!uuid_equal(uuid, &xfs_uuid_table[i])) continue; @@ -174,7 +173,7 @@ xfs_free_perag( int xfs_sb_validate_fsb_count( xfs_sb_t *sbp, - __uint64_t nblocks) + uint64_t nblocks) { ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); @@ -436,7 +435,7 @@ STATIC void xfs_set_maxicount(xfs_mount_t *mp) { xfs_sb_t *sbp = &(mp->m_sb); - __uint64_t icount; + uint64_t icount; if (sbp->sb_imax_pct) { /* @@ -502,7 +501,7 @@ xfs_set_low_space_thresholds( int i; for (i = 0; i < XFS_LOWSP_MAX; i++) { - __uint64_t space = mp->m_sb.sb_dblocks; + uint64_t space = mp->m_sb.sb_dblocks; do_div(space, 100); mp->m_low_space[i] = space * (i + 1); @@ -598,10 +597,10 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } -__uint64_t +uint64_t xfs_default_resblks(xfs_mount_t *mp) { - __uint64_t resblks; + uint64_t resblks; /* * We default to 5% or 8192 fsbs of space reserved, whichever is @@ -612,7 +611,7 @@ xfs_default_resblks(xfs_mount_t *mp) */ resblks = mp->m_sb.sb_dblocks; do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 8192); + resblks = min_t(uint64_t, resblks, 8192); return resblks; } @@ -632,7 +631,7 @@ xfs_mountfs( { struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; - __uint64_t resblks; + uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -720,10 +719,13 @@ xfs_mountfs( if (error) goto out_del_stats; + error = xfs_errortag_init(mp); + if (error) + goto out_remove_error_sysfs; error = xfs_uuid_mount(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_errortag; /* * Set the minimum read and write sizes @@ -793,7 +795,10 @@ xfs_mountfs( * Copies the low order bits of the timestamp and the randomly * set "sequence" number out of a UUID. */ - uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + mp->m_fixedfsid[0] = + (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) | + get_unaligned_be16(&sbp->sb_uuid.b[4]); + mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); mp->m_dmevmask = 0; /* not persistent; set after each mount */ @@ -1042,6 +1047,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_errortag: + xfs_errortag_del(mp); out_remove_error_sysfs: xfs_error_sysfs_del(mp); out_del_stats: @@ -1060,7 +1067,7 @@ void xfs_unmountfs( struct xfs_mount *mp) { - __uint64_t resblks; + uint64_t resblks; int error; cancel_delayed_work_sync(&mp->m_eofblocks_work); @@ -1145,10 +1152,11 @@ xfs_unmountfs( xfs_uuid_unmount(mp); #if defined(DEBUG) - xfs_errortag_clearall(mp, 0); + xfs_errortag_clearall(mp); #endif xfs_free_perag(mp); + xfs_errortag_del(mp); xfs_error_sysfs_del(mp); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); @@ -1209,7 +1217,7 @@ xfs_mod_icount( struct xfs_mount *mp, int64_t delta) { - __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { ASSERT(0); percpu_counter_add(&mp->m_icount, -delta); @@ -1288,7 +1296,7 @@ xfs_mod_fdblocks( else batch = XFS_FDBLOCKS_BATCH; - __percpu_counter_add(&mp->m_fdblocks, delta, batch); + percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9fa312a41c93..e0792d036be2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -108,10 +108,10 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - __uint8_t m_blkbit_log; /* blocklog + NBBY */ - __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ - __uint8_t m_agno_log; /* log #ag's */ - __uint8_t m_agino_log; /* #bits for agino in inum */ + uint8_t m_blkbit_log; /* blocklog + NBBY */ + uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ + uint8_t m_agno_log; /* log #ag's */ + uint8_t m_agino_log; /* #bits for agino in inum */ uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ @@ -139,7 +139,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ - __uint64_t m_flags; /* global mount flags */ + uint64_t m_flags; /* global mount flags */ bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ @@ -148,14 +148,14 @@ typedef struct xfs_mount { int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ - __uint64_t m_maxicount; /* maximum inode count */ - __uint64_t m_resblks; /* total reserved blocks */ - __uint64_t m_resblks_avail;/* available reserved blocks */ - __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + uint64_t m_maxicount; /* maximum inode count */ + uint64_t m_resblks; /* total reserved blocks */ + uint64_t m_resblks_avail;/* available reserved blocks */ + uint64_t m_resblks_save; /* reserved blks @ remount,ro */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ - __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ @@ -194,19 +194,17 @@ typedef struct xfs_mount { * ever support shrinks it would have to be persisted in addition * to various other kinds of pain inflicted on the pNFS server. */ - __uint32_t m_generation; + uint32_t m_generation; bool m_fail_unmount; #ifdef DEBUG /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Frequency with which errors are injected. Replaces xfs_etest; the + * value stored in here is the inverse of the frequency with which the + * error triggers. 1 = always, 2 = half the time, etc. */ - bool m_drop_writes; + unsigned int *m_errortag; + struct xfs_kobj m_errortag_kobj; #endif } xfs_mount_t; @@ -325,20 +323,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -#ifdef DEBUG -static inline bool -xfs_mp_drop_writes(struct xfs_mount *mp) -{ - return mp->m_drop_writes; -} -#else -static inline bool -xfs_mp_drop_writes(struct xfs_mount *mp) -{ - return 0; -} -#endif - /* per-AG block reservation data structures*/ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, @@ -367,12 +351,12 @@ typedef struct xfs_perag { char pagi_init; /* this agi's entry is initialized */ char pagf_metadata; /* the agf is preferred to be metadata */ char pagi_inodeok; /* The agi is ok for inodes */ - __uint8_t pagf_levels[XFS_BTNUM_AGF]; + uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ - __uint32_t pagf_flcount; /* count of blocks in freelist */ + uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ - __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ @@ -411,7 +395,7 @@ typedef struct xfs_perag { struct xfs_ag_resv pag_agfl_resv; /* reference count */ - __uint8_t pagf_refcount_level; + uint8_t pagf_refcount_level; } xfs_perag_t; static inline struct xfs_ag_resv * @@ -434,7 +418,7 @@ void xfs_buf_hash_destroy(xfs_perag_t *pag); extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); -extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); @@ -450,7 +434,7 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); -extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5fe6e70b88ef..6ce948c436d5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1247,6 +1247,7 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { + struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; @@ -1257,7 +1258,32 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - xfs_dqflock(dqp); + /* + * The only way the dquot is already flush locked by the time quotacheck + * gets here is if reclaim flushed it before the dqadjust walk dirtied + * it for the final time. Quotacheck collects all dquot bufs in the + * local delwri queue before dquots are dirtied, so reclaim can't have + * possibly queued it for I/O. The only way out is to push the buffer to + * cycle the flush lock. + */ + if (!xfs_dqflock_nowait(dqp)) { + /* buf is pinned in-core by delwri list */ + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen); + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + if (!bp) { + error = -EINVAL; + goto out_unlock; + } + xfs_buf_unlock(bp); + + xfs_buf_delwri_pushbuf(bp, buffer_list); + xfs_buf_rele(bp); + + error = -EAGAIN; + goto out_unlock; + } + error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 3e52d5de7ae1..2be6d2735ca9 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -33,7 +33,7 @@ xfs_fill_statvfs_from_dquot( struct kstatfs *statp, struct xfs_dquot *dqp) { - __uint64_t limit; + uint64_t limit; limit = dqp->q_core.d_blk_softlimit ? be64_to_cpu(dqp->q_core.d_blk_softlimit) : diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index f82d79a8c694..de9493253edf 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -269,7 +269,6 @@ xfs_fs_get_nextdqblk( /* ID may be different, so convert back what we got */ *qid = make_kqid(current_user_ns(), qid->type, id); return 0; - } STATIC int diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ffe6fe7a7eb5..ab2270a87196 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -155,6 +155,7 @@ int xfs_reflink_find_shared( struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, @@ -166,18 +167,18 @@ xfs_reflink_find_shared( struct xfs_btree_cur *cur; int error; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); return error; } @@ -217,7 +218,7 @@ xfs_reflink_trim_around_shared( agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); aglen = irec->br_blockcount; - error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, + error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, aglen, &fbno, &flen, true); if (error) return error; @@ -1373,8 +1374,8 @@ xfs_reflink_dirty_extents( agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); aglen = map[1].br_blockcount; - error = xfs_reflink_find_shared(mp, agno, agbno, aglen, - &rbno, &rlen, true); + error = xfs_reflink_find_shared(mp, NULL, agno, agbno, + aglen, &rbno, &rlen, true); if (error) goto out; if (rbno == NULLAGBLOCK) @@ -1405,57 +1406,73 @@ out: return error; } -/* Clear the inode reflink flag if there are no shared extents. */ +/* Does this inode need the reflink flag? */ int -xfs_reflink_clear_inode_flag( - struct xfs_inode *ip, - struct xfs_trans **tpp) +xfs_reflink_inode_has_shared_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + bool *has_shared) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t fbno; - xfs_filblks_t end; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; - struct xfs_bmbt_irec map; - int nmaps; - int error = 0; - - ASSERT(xfs_is_reflink_inode(ip)); + struct xfs_bmbt_irec got; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + xfs_extnum_t idx; + bool found; + int error; - fbno = 0; - end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); - while (end - fbno > 0) { - nmaps = 1; - /* - * Look for extents in the file. Skip holes, delalloc, or - * unwritten extents; they can't be reflinked. - */ - error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) return error; - if (nmaps == 0) - break; - if (!xfs_bmap_is_real_extent(&map)) - goto next; + } - agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); - aglen = map.br_blockcount; + *has_shared = false; + found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got); + while (found) { + if (isnullstartblock(got.br_startblock) || + got.br_state != XFS_EXT_NORM) + goto next; + agno = XFS_FSB_TO_AGNO(mp, got.br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); + aglen = got.br_blockcount; - error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen, &rbno, &rlen, false); if (error) return error; /* Is there still a shared block here? */ - if (rbno != NULLAGBLOCK) + if (rbno != NULLAGBLOCK) { + *has_shared = true; return 0; + } next: - fbno = map.br_startoff + map.br_blockcount; + found = xfs_iext_get_extent(ifp, ++idx, &got); } + return 0; +} + +/* Clear the inode reflink flag if there are no shared extents. */ +int +xfs_reflink_clear_inode_flag( + struct xfs_inode *ip, + struct xfs_trans **tpp) +{ + bool needs_flag; + int error = 0; + + ASSERT(xfs_is_reflink_inode(ip)); + + error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); + if (error || needs_flag) + return error; + /* * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index d29a7967f029..701487bab468 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -20,9 +20,9 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 -extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, - xfs_extlen_t *flen, bool find_maximal); +extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, + xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); @@ -47,6 +47,8 @@ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, + struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c57aa7f18087..91472193643b 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1256,13 +1256,13 @@ xfs_rtpick_extent( { xfs_rtblock_t b; /* result block */ int log2; /* log of sequence number */ - __uint64_t resid; /* residual after log removed */ - __uint64_t seq; /* sequence number of file creation */ - __uint64_t *seqp; /* pointer to seqno in inode */ + uint64_t resid; /* residual after log removed */ + uint64_t seq; /* sequence number of file creation */ + uint64_t *seqp; /* pointer to seqno in inode */ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; + seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; *seqp = 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f13133e6f19f..79defa722bf1 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -107,6 +107,8 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); @@ -143,6 +145,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp, # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) # define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f11282c96887..056e12b421eb 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -33,9 +33,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { int i, j; int len = 0; - __uint64_t xs_xstrat_bytes = 0; - __uint64_t xs_write_bytes = 0; - __uint64_t xs_read_bytes = 0; + uint64_t xs_xstrat_bytes = 0; + uint64_t xs_write_bytes = 0; + uint64_t xs_read_bytes = 0; static const struct xstats_entry { char *desc; @@ -100,7 +100,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) void xfs_stats_clearall(struct xfsstats __percpu *stats) { int c; - __uint32_t vn_active; + uint32_t vn_active; xfs_notice(NULL, "Clearing xfsstats"); for_each_possible_cpu(c) { diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 375840f5a99a..f64d0ae345c4 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -54,125 +54,125 @@ enum { */ struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 - __uint32_t xs_allocx; - __uint32_t xs_allocb; - __uint32_t xs_freex; - __uint32_t xs_freeb; + uint32_t xs_allocx; + uint32_t xs_allocb; + uint32_t xs_freex; + uint32_t xs_freeb; # define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) - __uint32_t xs_abt_lookup; - __uint32_t xs_abt_compare; - __uint32_t xs_abt_insrec; - __uint32_t xs_abt_delrec; + uint32_t xs_abt_lookup; + uint32_t xs_abt_compare; + uint32_t xs_abt_insrec; + uint32_t xs_abt_delrec; # define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) - __uint32_t xs_blk_mapr; - __uint32_t xs_blk_mapw; - __uint32_t xs_blk_unmap; - __uint32_t xs_add_exlist; - __uint32_t xs_del_exlist; - __uint32_t xs_look_exlist; - __uint32_t xs_cmp_exlist; + uint32_t xs_blk_mapr; + uint32_t xs_blk_mapw; + uint32_t xs_blk_unmap; + uint32_t xs_add_exlist; + uint32_t xs_del_exlist; + uint32_t xs_look_exlist; + uint32_t xs_cmp_exlist; # define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) - __uint32_t xs_bmbt_lookup; - __uint32_t xs_bmbt_compare; - __uint32_t xs_bmbt_insrec; - __uint32_t xs_bmbt_delrec; + uint32_t xs_bmbt_lookup; + uint32_t xs_bmbt_compare; + uint32_t xs_bmbt_insrec; + uint32_t xs_bmbt_delrec; # define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) - __uint32_t xs_dir_lookup; - __uint32_t xs_dir_create; - __uint32_t xs_dir_remove; - __uint32_t xs_dir_getdents; + uint32_t xs_dir_lookup; + uint32_t xs_dir_create; + uint32_t xs_dir_remove; + uint32_t xs_dir_getdents; # define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) - __uint32_t xs_trans_sync; - __uint32_t xs_trans_async; - __uint32_t xs_trans_empty; + uint32_t xs_trans_sync; + uint32_t xs_trans_async; + uint32_t xs_trans_empty; # define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) - __uint32_t xs_ig_attempts; - __uint32_t xs_ig_found; - __uint32_t xs_ig_frecycle; - __uint32_t xs_ig_missed; - __uint32_t xs_ig_dup; - __uint32_t xs_ig_reclaims; - __uint32_t xs_ig_attrchg; + uint32_t xs_ig_attempts; + uint32_t xs_ig_found; + uint32_t xs_ig_frecycle; + uint32_t xs_ig_missed; + uint32_t xs_ig_dup; + uint32_t xs_ig_reclaims; + uint32_t xs_ig_attrchg; # define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) - __uint32_t xs_log_writes; - __uint32_t xs_log_blocks; - __uint32_t xs_log_noiclogs; - __uint32_t xs_log_force; - __uint32_t xs_log_force_sleep; + uint32_t xs_log_writes; + uint32_t xs_log_blocks; + uint32_t xs_log_noiclogs; + uint32_t xs_log_force; + uint32_t xs_log_force_sleep; # define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) - __uint32_t xs_try_logspace; - __uint32_t xs_sleep_logspace; - __uint32_t xs_push_ail; - __uint32_t xs_push_ail_success; - __uint32_t xs_push_ail_pushbuf; - __uint32_t xs_push_ail_pinned; - __uint32_t xs_push_ail_locked; - __uint32_t xs_push_ail_flushing; - __uint32_t xs_push_ail_restarts; - __uint32_t xs_push_ail_flush; + uint32_t xs_try_logspace; + uint32_t xs_sleep_logspace; + uint32_t xs_push_ail; + uint32_t xs_push_ail_success; + uint32_t xs_push_ail_pushbuf; + uint32_t xs_push_ail_pinned; + uint32_t xs_push_ail_locked; + uint32_t xs_push_ail_flushing; + uint32_t xs_push_ail_restarts; + uint32_t xs_push_ail_flush; # define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) - __uint32_t xs_xstrat_quick; - __uint32_t xs_xstrat_split; + uint32_t xs_xstrat_quick; + uint32_t xs_xstrat_split; # define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) - __uint32_t xs_write_calls; - __uint32_t xs_read_calls; + uint32_t xs_write_calls; + uint32_t xs_read_calls; # define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) - __uint32_t xs_attr_get; - __uint32_t xs_attr_set; - __uint32_t xs_attr_remove; - __uint32_t xs_attr_list; + uint32_t xs_attr_get; + uint32_t xs_attr_set; + uint32_t xs_attr_remove; + uint32_t xs_attr_list; # define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) - __uint32_t xs_iflush_count; - __uint32_t xs_icluster_flushcnt; - __uint32_t xs_icluster_flushinode; + uint32_t xs_iflush_count; + uint32_t xs_icluster_flushcnt; + uint32_t xs_icluster_flushinode; # define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) - __uint32_t vn_active; /* # vnodes not on free lists */ - __uint32_t vn_alloc; /* # times vn_alloc called */ - __uint32_t vn_get; /* # times vn_get called */ - __uint32_t vn_hold; /* # times vn_hold called */ - __uint32_t vn_rele; /* # times vn_rele called */ - __uint32_t vn_reclaim; /* # times vn_reclaim called */ - __uint32_t vn_remove; /* # times vn_remove called */ - __uint32_t vn_free; /* # times vn_free called */ + uint32_t vn_active; /* # vnodes not on free lists */ + uint32_t vn_alloc; /* # times vn_alloc called */ + uint32_t vn_get; /* # times vn_get called */ + uint32_t vn_hold; /* # times vn_hold called */ + uint32_t vn_rele; /* # times vn_rele called */ + uint32_t vn_reclaim; /* # times vn_reclaim called */ + uint32_t vn_remove; /* # times vn_remove called */ + uint32_t vn_free; /* # times vn_free called */ #define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) - __uint32_t xb_get; - __uint32_t xb_create; - __uint32_t xb_get_locked; - __uint32_t xb_get_locked_waited; - __uint32_t xb_busy_locked; - __uint32_t xb_miss_locked; - __uint32_t xb_page_retries; - __uint32_t xb_page_found; - __uint32_t xb_get_read; + uint32_t xb_get; + uint32_t xb_create; + uint32_t xb_get_locked; + uint32_t xb_get_locked_waited; + uint32_t xb_busy_locked; + uint32_t xb_miss_locked; + uint32_t xb_page_retries; + uint32_t xb_page_found; + uint32_t xb_get_read; /* Version 2 btree counters */ #define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) - __uint32_t xs_abtb_2[__XBTS_MAX]; + uint32_t xs_abtb_2[__XBTS_MAX]; #define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) - __uint32_t xs_abtc_2[__XBTS_MAX]; + uint32_t xs_abtc_2[__XBTS_MAX]; #define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) - __uint32_t xs_bmbt_2[__XBTS_MAX]; + uint32_t xs_bmbt_2[__XBTS_MAX]; #define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) - __uint32_t xs_ibt_2[__XBTS_MAX]; + uint32_t xs_ibt_2[__XBTS_MAX]; #define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) - __uint32_t xs_fibt_2[__XBTS_MAX]; + uint32_t xs_fibt_2[__XBTS_MAX]; #define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) - __uint32_t xs_rmap_2[__XBTS_MAX]; + uint32_t xs_rmap_2[__XBTS_MAX]; #define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) - __uint32_t xs_refcbt_2[__XBTS_MAX]; + uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) - __uint32_t xs_qm_dqreclaims; - __uint32_t xs_qm_dqreclaim_misses; - __uint32_t xs_qm_dquot_dups; - __uint32_t xs_qm_dqcachemisses; - __uint32_t xs_qm_dqcachehits; - __uint32_t xs_qm_dqwants; + uint32_t xs_qm_dqreclaims; + uint32_t xs_qm_dqreclaim_misses; + uint32_t xs_qm_dquot_dups; + uint32_t xs_qm_dqcachemisses; + uint32_t xs_qm_dqcachehits; + uint32_t xs_qm_dqwants; #define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) - __uint32_t xs_qm_dquot; - __uint32_t xs_qm_dquot_unused; + uint32_t xs_qm_dquot; + uint32_t xs_qm_dquot_unused; /* Extra precision counters */ - __uint64_t xs_xstrat_bytes; - __uint64_t xs_write_bytes; - __uint64_t xs_read_bytes; + uint64_t xs_xstrat_bytes; + uint64_t xs_write_bytes; + uint64_t xs_read_bytes; }; struct xfsstats { @@ -186,7 +186,7 @@ struct xfsstats { * simple wrapper for getting the array index of s struct member offset */ #define XFS_STATS_CALC_INDEX(member) \ - (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + (offsetof(struct __xfsstats, member) / (int)sizeof(uint32_t)) int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 47d239dcf3f4..38aaacdbb8b3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -52,6 +52,7 @@ #include "xfs_reflink.h" #include <linux/namei.h> +#include <linux/dax.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/mount.h> @@ -195,7 +196,7 @@ xfs_parseargs( int dsunit = 0; int dswidth = 0; int iosize = 0; - __uint8_t iosizelog = 0; + uint8_t iosizelog = 0; /* * set up the mount name first so all the errors will refer to the @@ -555,7 +556,7 @@ xfs_showargs( return 0; } -static __uint64_t +static uint64_t xfs_max_file_offset( unsigned int blockshift) { @@ -586,7 +587,7 @@ xfs_max_file_offset( # endif #endif - return (((__uint64_t)pagefactor) << bitshift) - 1; + return (((uint64_t)pagefactor) << bitshift) - 1; } /* @@ -621,7 +622,7 @@ xfs_set_inode_alloc( * the max inode percentage. Used only for inode32. */ if (mp->m_maxicount) { - __uint64_t icount; + uint64_t icount; icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); @@ -1087,12 +1088,12 @@ xfs_fs_statfs( struct xfs_mount *mp = XFS_M(dentry->d_sb); xfs_sb_t *sbp = &mp->m_sb; struct xfs_inode *ip = XFS_I(d_inode(dentry)); - __uint64_t fakeinos, id; - __uint64_t icount; - __uint64_t ifree; - __uint64_t fdblocks; + uint64_t fakeinos, id; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; xfs_extlen_t lsize; - __int64_t ffree; + int64_t ffree; statp->f_type = XFS_SB_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1115,7 +1116,7 @@ xfs_fs_statfs( statp->f_bavail = statp->f_bfree; fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + statp->f_files = MIN(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1128,7 +1129,7 @@ xfs_fs_statfs( /* make sure statp->f_ffree does not underflow */ ffree = statp->f_files - (icount - ifree); - statp->f_ffree = max_t(__int64_t, ffree, 0); + statp->f_ffree = max_t(int64_t, ffree, 0); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && @@ -1141,7 +1142,7 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - __uint64_t resblks = 0; + uint64_t resblks = 0; mp->m_resblks_save = mp->m_resblks; xfs_reserve_blocks(mp, &resblks, NULL); @@ -1150,7 +1151,7 @@ xfs_save_resvblks(struct xfs_mount *mp) STATIC void xfs_restore_resvblks(struct xfs_mount *mp) { - __uint64_t resblks; + uint64_t resblks; if (mp->m_resblks_save) { resblks = mp->m_resblks_save; @@ -1765,7 +1766,8 @@ STATIC int __init xfs_init_zones(void) { xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, - offsetof(struct xfs_ioend, io_inline_bio)); + offsetof(struct xfs_ioend, io_inline_bio), + BIOSET_NEED_BVECS); if (!xfs_ioend_bioset) goto out; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index f2cb45ed1d54..12cd9cf7de41 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -43,8 +43,8 @@ #include "xfs_log.h" /* ----- Kernel only functions below ----- */ -STATIC int -xfs_readlink_bmap( +int +xfs_readlink_bmap_ilocked( struct xfs_inode *ip, char *link) { @@ -143,7 +143,7 @@ xfs_readlink( if (!pathlen) goto out; - if (pathlen < 0 || pathlen > MAXPATHLEN) { + if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", __func__, (unsigned long long) ip->i_ino, (long long) pathlen); @@ -153,7 +153,7 @@ xfs_readlink( } - error = xfs_readlink_bmap(ip, link); + error = xfs_readlink_bmap_ilocked(ip, link); out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -202,7 +202,7 @@ xfs_symlink( * Check component lengths of the target path name. */ pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ + if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */ return -ENAMETOOLONG; udqp = gdqp = NULL; @@ -559,7 +559,7 @@ xfs_inactive_symlink( return 0; } - if (pathlen < 0 || pathlen > MAXPATHLEN) { + if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index e75245d09116..aeaee8923617 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -21,6 +21,7 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_inactive_symlink(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 984a3499cfe3..82afee005140 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -95,6 +95,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ + bool bug_on_assert; /* BUG() the kernel on assert failure */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 80ac15fb9638..8b2ccc234f36 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -90,15 +90,25 @@ to_mp(struct kobject *kobject) return container_of(kobj, struct xfs_mount, m_kobj); } +static struct attribute *xfs_mp_attrs[] = { + NULL, +}; + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_mp_attrs, +}; + #ifdef DEBUG +/* debug */ STATIC ssize_t -drop_writes_store( +bug_on_assert_store( struct kobject *kobject, const char *buf, size_t count) { - struct xfs_mount *mp = to_mp(kobject); int ret; int val; @@ -107,9 +117,9 @@ drop_writes_store( return ret; if (val == 1) - mp->m_drop_writes = true; + xfs_globals.bug_on_assert = true; else if (val == 0) - mp->m_drop_writes = false; + xfs_globals.bug_on_assert = false; else return -EINVAL; @@ -117,33 +127,13 @@ drop_writes_store( } STATIC ssize_t -drop_writes_show( +bug_on_assert_show( struct kobject *kobject, char *buf) { - struct xfs_mount *mp = to_mp(kobject); - - return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0); + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0); } -XFS_SYSFS_ATTR_RW(drop_writes); - -#endif /* DEBUG */ - -static struct attribute *xfs_mp_attrs[] = { -#ifdef DEBUG - ATTR_LIST(drop_writes), -#endif - NULL, -}; - -struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_attrs = xfs_mp_attrs, -}; - -#ifdef DEBUG -/* debug */ +XFS_SYSFS_ATTR_RW(bug_on_assert); STATIC ssize_t log_recovery_delay_store( @@ -176,6 +166,7 @@ log_recovery_delay_show( XFS_SYSFS_ATTR_RW(log_recovery_delay); static struct attribute *xfs_dbg_attrs[] = { + ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), NULL, }; @@ -314,47 +305,11 @@ write_grant_head_show( } XFS_SYSFS_ATTR_RO(write_grant_head); -#ifdef DEBUG -STATIC ssize_t -log_badcrc_factor_store( - struct kobject *kobject, - const char *buf, - size_t count) -{ - struct xlog *log = to_xlog(kobject); - int ret; - uint32_t val; - - ret = kstrtouint(buf, 0, &val); - if (ret) - return ret; - - log->l_badcrc_factor = val; - - return count; -} - -STATIC ssize_t -log_badcrc_factor_show( - struct kobject *kobject, - char *buf) -{ - struct xlog *log = to_xlog(kobject); - - return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor); -} - -XFS_SYSFS_ATTR_RW(log_badcrc_factor); -#endif /* DEBUG */ - static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head), ATTR_LIST(write_grant_head), -#ifdef DEBUG - ATTR_LIST(log_badcrc_factor), -#endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7c5a16528d8b..bcc3cdf8e1c5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -251,7 +251,7 @@ TRACE_EVENT(xfs_iext_insert, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - (__int64_t)__entry->startblock, + (int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - (__int64_t)__entry->startblock, + (int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -367,6 +367,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); @@ -1280,7 +1281,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), __entry->startoff, - (__int64_t)__entry->startblock, + (int64_t)__entry->startblock, __entry->blockcount) ) @@ -1490,25 +1491,6 @@ TRACE_EVENT(xfs_extent_busy_trim, __entry->tlen) ); -TRACE_EVENT(xfs_trans_commit_lsn, - TP_PROTO(struct xfs_trans *trans), - TP_ARGS(trans), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(struct xfs_trans *, tp) - __field(xfs_lsn_t, lsn) - ), - TP_fast_assign( - __entry->dev = trans->t_mountp->m_super->s_dev; - __entry->tp = trans; - __entry->lsn = trans->t_commit_lsn; - ), - TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->tp, - __entry->lsn) -); - TRACE_EVENT(xfs_agf, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), @@ -2057,7 +2039,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, TP_ARGS(log, buf_f), TP_STRUCT__entry( __field(dev_t, dev) - __field(__int64_t, blkno) + __field(int64_t, blkno) __field(unsigned short, len) __field(unsigned short, flags) __field(unsigned short, size) @@ -2106,7 +2088,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, __field(int, fields) __field(unsigned short, asize) __field(unsigned short, dsize) - __field(__int64_t, blkno) + __field(int64_t, blkno) __field(int, len) __field(int, boffset) ), @@ -3256,8 +3238,8 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __field(xfs_agnumber_t, agno) __field(xfs_fsblock_t, bno) __field(xfs_filblks_t, len) - __field(__uint64_t, owner) - __field(__uint64_t, offset) + __field(uint64_t, owner) + __field(uint64_t, offset) __field(unsigned int, flags) ), TP_fast_assign( @@ -3297,9 +3279,9 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class, __field(dev_t, keydev) __field(xfs_daddr_t, block) __field(xfs_daddr_t, len) - __field(__uint64_t, owner) - __field(__uint64_t, offset) - __field(__uint64_t, flags) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(uint64_t, flags) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a07acbf0bd8a..6bdad6f58934 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -105,10 +105,6 @@ typedef struct xfs_trans { unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ struct xlog_ticket *t_ticket; /* log mgr ticket */ - xfs_lsn_t t_lsn; /* log seq num of start of - * transaction. */ - xfs_lsn_t t_commit_lsn; /* log seq num of end of - * transaction. */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ unsigned int t_flags; /* misc flags */ @@ -249,7 +245,7 @@ struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, struct xfs_rui_log_item *ruip); int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, - __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); @@ -275,6 +271,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, enum xfs_bmap_intent_type type, struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, xfs_exntst_t state); + xfs_filblks_t *blockcount, xfs_exntst_t state); #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 6408e7d7c08c..14543d93cd4b 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t blockcount, + xfs_filblks_t *blockcount, xfs_exntst_t state) { int error; @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( void **state) { struct xfs_bmap_intent *bmap; + xfs_filblks_t count; int error; bmap = container_of(item, struct xfs_bmap_intent, bi_list); + count = bmap->bi_bmap.br_blockcount; error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, bmap->bi_bmap.br_startblock, - bmap->bi_bmap.br_blockcount, + &count, bmap->bi_bmap.br_state); + if (!error && count > 0) { + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); + bmap->bi_bmap.br_blockcount = count; + return -EAGAIN; + } kmem_free(bmap); return error; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee29ca132dc..86987d823d76 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_t *bp) { xfs_buf_log_item_t *bip; + int freed; /* * Default to a normal brelse() call if the tp is NULL. @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Drop our reference to the buf log item. */ - atomic_dec(&bip->bli_refcount); + freed = atomic_dec_and_test(&bip->bli_refcount); /* - * If the buf item is not tracking data in the log, then - * we must free it before releasing the buffer back to the - * free pool. Before releasing the buffer to the free pool, - * clear the transaction pointer in b_fsprivate2 to dissolve - * its relation to this transaction. + * If the buf item is not tracking data in the log, then we must free it + * before releasing the buffer back to the free pool. + * + * If the fs has shutdown and we dropped the last reference, it may fall + * on us to release a (possibly dirty) bli if it never made it to the + * AIL (e.g., the aborted unpin already happened and didn't release it + * due to our reference). Since we're already shutdown and need xa_lock, + * just force remove from the AIL and release the bli here. */ - if (!xfs_buf_item_dirty(bip)) { + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); + } else if (!xfs_buf_item_dirty(bip)) { /*** ASSERT(bp->b_pincount == 0); ***/ diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 9ead064b5e90..9b577beb43d7 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -96,7 +96,7 @@ xfs_trans_log_finish_rmap_update( struct xfs_trans *tp, struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, - __uint64_t owner, + uint64_t owner, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, |