From 1c1266bb916e6a6b362d3be95f2cc7f3c41277a6 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 12 Jan 2011 16:53:27 -0800 Subject: ceph: fix getattr on directory when using norbytes The norbytes mount option was broken, and when doing getattr on a directory it return the rbytes instead of the number of entities. This commit fixes it. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e791fa34b23d..50001de66c69 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -701,10 +701,6 @@ static int fill_inode(struct inode *inode, ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_max_offset = 2; } - - /* it may be better to set st_size in getattr instead? */ - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) - inode->i_size = ci->i_rbytes; break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -1805,7 +1801,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, else stat->dev = 0; if (S_ISDIR(inode->i_mode)) { - stat->size = ci->i_rbytes; + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; stat->blocks = 0; stat->blksize = 65536; } -- cgit v1.2.1 From 17db143fc091238c43ab9f373974ca2224a4c3f8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jan 2011 15:27:29 -0800 Subject: ceph: fix xattr rbtree search Fix xattr name comparison in rbtree search for strings that share a prefix. The *name argument is null terminated, but the xattr name is not, so we need to use strncmp, but that means adjusting for the case where name is a prefix of xattr->name. The corresponding case in __set_xattr() already handles this properly (although in that case *name is also not null terminated). Reported-by: Sergiy Kibrik Signed-off-by: Sage Weil --- fs/ceph/xattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 6e12a6ba5f79..8c9eba6ef9df 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); int c; p = &ci->i_xattrs.index.rb_node; @@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, parent = *p; xattr = rb_entry(parent, struct ceph_inode_xattr, node); c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; if (c < 0) p = &(*p)->rb_left; else if (c > 0) -- cgit v1.2.1 From 50aac4fec503960380ab594a93a6fbfdf3f8915f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 07:59:40 -0800 Subject: ceph: fix cap_wanted_delay_{min,max} mount option initialization These were initialized to 0 instead of the default, fallout from the RBD refactor in 3d14c5d2b6e15c21d8e5467dc62d33127c23a644. Signed-off-by: Sage Weil --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf6f0f34082a..9c5085465a63 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; -- cgit v1.2.1 From 24be0c481067560b11441e794e27f166a3568863 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 08:48:06 -0800 Subject: ceph: fix erroneous cap flush to non-auth mds The int flushing is global and not clear on each iteration of the loop, which can cause a second flush of caps to any MDSs with ids greater than the auth. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 60d27bc9eb83..f654c7e933ac 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1658,6 +1658,8 @@ ack: if (cap == ci->i_auth_cap && ci->i_dirty_caps) flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; mds = cap->mds; /* remember mds, so we don't repeat */ sent++; -- cgit v1.2.1 From 088b3f5e9ee2649f5cfc2f08d8ce654e3eeba310 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 08:56:01 -0800 Subject: ceph: fix flushing of caps vs cap import If we are mid-flush and a cap is migrated to another node, we need to resend the cap flush message to the new MDS, and do so with the original flush_seq to avoid leaking across a sync boundary. Previously we didn't redo the flush (we only flushed newly dirty data), which would cause a later sync to hang forever. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f654c7e933ac..7def3f5903dd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1560,9 +1560,10 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ revoking = cap->implemented & ~cap->issued; - if (revoking) - dout(" mds%d revoking %s\n", cap->mds, - ceph_cap_string(revoking)); + dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { @@ -1942,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, } } +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&inode->i_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&inode->i_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&inode->i_lock); + } + } else { + spin_unlock(&inode->i_lock); + } +} + /* * Take references to capabilities we hold, so that we don't release @@ -2689,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ceph_add_cap(inode, session, cap_id, -1, issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, NULL /* no caps context */); - try_flush_caps(inode, session, NULL); + kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); /* make sure we re-request max_size, if necessary */ -- cgit v1.2.1 From 7e57b81c7688c762bc9e775bc83f9fc17946f527 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Jan 2011 09:00:01 -0800 Subject: ceph: avoid immediate cap check after import The NODELAY flag avoids the heuristics that delay cap (issued/wanted) release. There's no reason for that after we import a cap, and it kills whatever benefit we get from those delays. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7def3f5903dd..6b61ded701e1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2817,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, - session); + ceph_check_caps(ceph_inode(inode), 0, session); goto done_unlocked; } -- cgit v1.2.1 From 0da2a4ac33c291728d8be5bdb865467dcb078d13 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 19 Jan 2011 14:18:50 -0500 Subject: NFS: fix handling of malloc failure during nfs_flush_multi() Cleanup of the allocated list entries should not call put_nfs_open_context() on each entry, as the context will always be NULL, causing an oops. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 10d648ea128b..c8278f4046cb 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -932,7 +932,7 @@ out_bad: while (!list_empty(&list)) { data = list_entry(list.next, struct nfs_write_data, pages); list_del(&data->pages); - nfs_writedata_release(data); + nfs_writedata_free(data); } nfs_redirty_request(req); return -ENOMEM; -- cgit v1.2.1 From 0ca7a5b9ac5d301845dd6382ff25a699b6263a81 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 21 Jan 2011 16:40:31 +0900 Subject: nilfs2: fix crash after one superblock became unavailable Fixes the following kernel oops in nilfs_setup_super() which could arise if one of two super-blocks is unavailable. > BUG: unable to handle kernel NULL pointer dereference at (null) > Pid: 3529, comm: mount.nilfs2 Not tainted 2.6.37 #1 / > EIP: 0060:[] EFLAGS: 00010202 CPU: 3 > EIP is at memcpy+0xc/0x1b > Call Trace: > [] ? nilfs_setup_super+0x6c/0xa5 [nilfs2] > [] ? nilfs_get_root_dentry+0x81/0xcb [nilfs2] > [] ? nilfs_mount+0x4f9/0x62c [nilfs2] > [] ? kstrdup+0x36/0x3f > [] ? nilfs_mount+0x0/0x62c [nilfs2] > [] ? vfs_kern_mount+0x4d/0x12c > [] ? get_fs_type+0x76/0x8f > [] ? do_kern_mount+0x33/0xbf > [] ? do_mount+0x2ed/0x714 > [] ? copy_mount_options+0x28/0xfc > [] ? sys_mount+0x72/0xaf > [] ? syscall_call+0x7/0xb Reported-by: Wakko Warner Signed-off-by: Ryusuke Konishi Tested-by: Wakko Warner Cc: stable [2.6.37, 2.6.36] LKML-Reference: <20110121024918.GA29598@animx.eu.org> --- fs/nilfs2/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 0994f6a76c07..58fd707174e1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -704,7 +704,8 @@ skip_mount_setup: sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); /* synchronize sbp[1] with sbp[0] */ - memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); } -- cgit v1.2.1 From d66bbd441c08fe00ed2add1cf70cb243ebc2b27e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 21 Jan 2011 21:16:46 -0800 Subject: ceph: avoid picking MDS that is not active Ignore replication or auth frag data if it indicates an MDS that is not active. This can happen if the MDS shuts down and the client has stale data about the namespace distribution across the MDS cluster. If that's the case, fall back to directing the request based on the auth cap (which should always be accurate). Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 509339ceef72..a6949cc7c69a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, dout("choose_mds %p %llx.%llx " "frag %u mds%d (%d/%d)\n", inode, ceph_vinop(inode), - frag.frag, frag.mds, + frag.frag, mds, (int)r, frag.ndist); - return mds; + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; } /* since this file/dir wasn't known to be @@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, dout("choose_mds %p %llx.%llx " "frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); - return mds; + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; } } } -- cgit v1.2.1 From 93c100c0b423266c0ee28497e90fdf27c05e6b8e Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 25 Jan 2011 19:28:43 +0000 Subject: [CIFS] Replace cifs md5 hashing functions with kernel crypto APIs Replace remaining use of md5 hash functions local to cifs module with kernel crypto APIs. Remove header and source file containing those local functions. Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/Makefile | 2 +- fs/cifs/cifsencrypt.c | 1 - fs/cifs/link.c | 59 ++++++-- fs/cifs/md5.c | 366 -------------------------------------------------- fs/cifs/md5.h | 38 ------ fs/cifs/smbencrypt.c | 1 - 6 files changed, 51 insertions(+), 416 deletions(-) delete mode 100644 fs/cifs/md5.c delete mode 100644 fs/cifs/md5.h (limited to 'fs') diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 43b19dd39191..e1322296cb69 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ - md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ + md4.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o cifs-$(CONFIG_CIFS_ACL) += cifsacl.o diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 66f3d50d0676..35bf329c90e1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -24,7 +24,6 @@ #include "cifspdu.h" #include "cifsglob.h" #include "cifs_debug.h" -#include "md5.h" #include "cifs_unicode.h" #include "cifsproto.h" #include "ntlmssp.h" diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 306769de2fb5..d3444ea6ac71 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -28,7 +28,6 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" -#include "md5.h" #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) @@ -46,6 +45,45 @@ md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\ md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] +static int +symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) +{ + int rc; + unsigned int size; + struct crypto_shash *md5; + struct sdesc *sdescmd5; + + md5 = crypto_alloc_shash("md5", 0, 0); + if (!md5 || IS_ERR(md5)) { + rc = PTR_ERR(md5); + cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); + return rc; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); + sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!sdescmd5) { + rc = -ENOMEM; + cERROR(1, "%s: Memory allocation failure\n", __func__); + goto symlink_hash_err; + } + sdescmd5->shash.tfm = md5; + sdescmd5->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init md5 shash\n", __func__); + goto symlink_hash_err; + } + crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + +symlink_hash_err: + crypto_free_shash(md5); + kfree(sdescmd5); + + return rc; +} + static int CIFSParseMFSymlink(const u8 *buf, unsigned int buf_len, @@ -56,7 +94,6 @@ CIFSParseMFSymlink(const u8 *buf, unsigned int link_len; const char *md5_str1; const char *link_str; - struct MD5Context md5_ctx; u8 md5_hash[16]; char md5_str2[34]; @@ -70,9 +107,11 @@ CIFSParseMFSymlink(const u8 *buf, if (rc != 1) return -EINVAL; - cifs_MD5_init(&md5_ctx); - cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); - cifs_MD5_final(md5_hash, &md5_ctx); + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc); + return rc; + } snprintf(md5_str2, sizeof(md5_str2), CIFS_MF_SYMLINK_MD5_FORMAT, @@ -94,9 +133,9 @@ CIFSParseMFSymlink(const u8 *buf, static int CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) { + int rc; unsigned int link_len; unsigned int ofs; - struct MD5Context md5_ctx; u8 md5_hash[16]; if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) @@ -107,9 +146,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) return -ENAMETOOLONG; - cifs_MD5_init(&md5_ctx); - cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); - cifs_MD5_final(md5_hash, &md5_ctx); + rc = symlink_hash(link_len, link_str, md5_hash); + if (rc) { + cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc); + return rc; + } snprintf(buf, buf_len, CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c deleted file mode 100644 index 98b66a54c319..000000000000 --- a/fs/cifs/md5.c +++ /dev/null @@ -1,366 +0,0 @@ -/* - * This code implements the MD5 message-digest algorithm. - * The algorithm is due to Ron Rivest. This code was - * written by Colin Plumb in 1993, no copyright is claimed. - * This code is in the public domain; do with it what you wish. - * - * Equivalent code is available from RSA Data Security, Inc. - * This code has been tested against that, and is equivalent, - * except that you don't need to include two pages of legalese - * with every copy. - * - * To compute the message digest of a chunk of bytes, declare an - * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as - * needed on buffers full of bytes, and then call cifs_MD5_final, which - * will fill a supplied 16-byte array with the digest. - */ - -/* This code slightly modified to fit into Samba by - abartlet@samba.org Jun 2001 - and to fit the cifs vfs by - Steve French sfrench@us.ibm.com */ - -#include -#include "md5.h" - -static void MD5Transform(__u32 buf[4], __u32 const in[16]); - -/* - * Note: this code is harmless on little-endian machines. - */ -static void -byteReverse(unsigned char *buf, unsigned longs) -{ - __u32 t; - do { - t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 | - ((unsigned) buf[1] << 8 | buf[0]); - *(__u32 *) buf = t; - buf += 4; - } while (--longs); -} - -/* - * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious - * initialization constants. - */ -void -cifs_MD5_init(struct MD5Context *ctx) -{ - ctx->buf[0] = 0x67452301; - ctx->buf[1] = 0xefcdab89; - ctx->buf[2] = 0x98badcfe; - ctx->buf[3] = 0x10325476; - - ctx->bits[0] = 0; - ctx->bits[1] = 0; -} - -/* - * Update context to reflect the concatenation of another buffer full - * of bytes. - */ -void -cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) -{ - register __u32 t; - - /* Update bitcount */ - - t = ctx->bits[0]; - if ((ctx->bits[0] = t + ((__u32) len << 3)) < t) - ctx->bits[1]++; /* Carry from low to high */ - ctx->bits[1] += len >> 29; - - t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */ - - /* Handle any leading odd-sized chunks */ - - if (t) { - unsigned char *p = (unsigned char *) ctx->in + t; - - t = 64 - t; - if (len < t) { - memmove(p, buf, len); - return; - } - memmove(p, buf, t); - byteReverse(ctx->in, 16); - MD5Transform(ctx->buf, (__u32 *) ctx->in); - buf += t; - len -= t; - } - /* Process data in 64-byte chunks */ - - while (len >= 64) { - memmove(ctx->in, buf, 64); - byteReverse(ctx->in, 16); - MD5Transform(ctx->buf, (__u32 *) ctx->in); - buf += 64; - len -= 64; - } - - /* Handle any remaining bytes of data. */ - - memmove(ctx->in, buf, len); -} - -/* - * Final wrapup - pad to 64-byte boundary with the bit pattern - * 1 0* (64-bit count of bits processed, MSB-first) - */ -void -cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx) -{ - unsigned int count; - unsigned char *p; - - /* Compute number of bytes mod 64 */ - count = (ctx->bits[0] >> 3) & 0x3F; - - /* Set the first char of padding to 0x80. This is safe since there is - always at least one byte free */ - p = ctx->in + count; - *p++ = 0x80; - - /* Bytes of padding needed to make 64 bytes */ - count = 64 - 1 - count; - - /* Pad out to 56 mod 64 */ - if (count < 8) { - /* Two lots of padding: Pad the first block to 64 bytes */ - memset(p, 0, count); - byteReverse(ctx->in, 16); - MD5Transform(ctx->buf, (__u32 *) ctx->in); - - /* Now fill the next block with 56 bytes */ - memset(ctx->in, 0, 56); - } else { - /* Pad block to 56 bytes */ - memset(p, 0, count - 8); - } - byteReverse(ctx->in, 14); - - /* Append length in bits and transform */ - ((__u32 *) ctx->in)[14] = ctx->bits[0]; - ((__u32 *) ctx->in)[15] = ctx->bits[1]; - - MD5Transform(ctx->buf, (__u32 *) ctx->in); - byteReverse((unsigned char *) ctx->buf, 4); - memmove(digest, ctx->buf, 16); - memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ -} - -/* The four core functions - F1 is optimized somewhat */ - -/* #define F1(x, y, z) (x & y | ~x & z) */ -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) - -/* This is the central step in the MD5 algorithm. */ -#define MD5STEP(f, w, x, y, z, data, s) \ - (w += f(x, y, z) + data, w = w<>(32-s), w += x) - -/* - * The core of the MD5 algorithm, this alters an existing MD5 hash to - * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks - * the data and converts bytes into longwords for this routine. - */ -static void -MD5Transform(__u32 buf[4], __u32 const in[16]) -{ - register __u32 a, b, c, d; - - a = buf[0]; - b = buf[1]; - c = buf[2]; - d = buf[3]; - - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - - buf[0] += a; - buf[1] += b; - buf[2] += c; - buf[3] += d; -} - -#if 0 /* currently unused */ -/*********************************************************************** - the rfc 2104 version of hmac_md5 initialisation. -***********************************************************************/ -static void -hmac_md5_init_rfc2104(unsigned char *key, int key_len, - struct HMACMD5Context *ctx) -{ - int i; - - /* if key is longer than 64 bytes reset it to key=MD5(key) */ - if (key_len > 64) { - unsigned char tk[16]; - struct MD5Context tctx; - - cifs_MD5_init(&tctx); - cifs_MD5_update(&tctx, key, key_len); - cifs_MD5_final(tk, &tctx); - - key = tk; - key_len = 16; - } - - /* start out by storing key in pads */ - memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad)); - memset(ctx->k_opad, 0, sizeof(ctx->k_opad)); - memcpy(ctx->k_ipad, key, key_len); - memcpy(ctx->k_opad, key, key_len); - - /* XOR key with ipad and opad values */ - for (i = 0; i < 64; i++) { - ctx->k_ipad[i] ^= 0x36; - ctx->k_opad[i] ^= 0x5c; - } - - cifs_MD5_init(&ctx->ctx); - cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64); -} -#endif - -/*********************************************************************** - the microsoft version of hmac_md5 initialisation. -***********************************************************************/ -void -hmac_md5_init_limK_to_64(const unsigned char *key, int key_len, - struct HMACMD5Context *ctx) -{ - int i; - - /* if key is longer than 64 bytes truncate it */ - if (key_len > 64) - key_len = 64; - - /* start out by storing key in pads */ - memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad)); - memset(ctx->k_opad, 0, sizeof(ctx->k_opad)); - memcpy(ctx->k_ipad, key, key_len); - memcpy(ctx->k_opad, key, key_len); - - /* XOR key with ipad and opad values */ - for (i = 0; i < 64; i++) { - ctx->k_ipad[i] ^= 0x36; - ctx->k_opad[i] ^= 0x5c; - } - - cifs_MD5_init(&ctx->ctx); - cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64); -} - -/*********************************************************************** - update hmac_md5 "inner" buffer -***********************************************************************/ -void -hmac_md5_update(const unsigned char *text, int text_len, - struct HMACMD5Context *ctx) -{ - cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */ -} - -/*********************************************************************** - finish off hmac_md5 "inner" buffer and generate outer one. -***********************************************************************/ -void -hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx) -{ - struct MD5Context ctx_o; - - cifs_MD5_final(digest, &ctx->ctx); - - cifs_MD5_init(&ctx_o); - cifs_MD5_update(&ctx_o, ctx->k_opad, 64); - cifs_MD5_update(&ctx_o, digest, 16); - cifs_MD5_final(digest, &ctx_o); -} - -/*********************************************************** - single function to calculate an HMAC MD5 digest from data. - use the microsoft hmacmd5 init method because the key is 16 bytes. -************************************************************/ -#if 0 /* currently unused */ -static void -hmac_md5(unsigned char key[16], unsigned char *data, int data_len, - unsigned char *digest) -{ - struct HMACMD5Context ctx; - hmac_md5_init_limK_to_64(key, 16, &ctx); - if (data_len != 0) - hmac_md5_update(data, data_len, &ctx); - - hmac_md5_final(digest, &ctx); -} -#endif diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h deleted file mode 100644 index 6fba8cb402fd..000000000000 --- a/fs/cifs/md5.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef MD5_H -#define MD5_H -#ifndef HEADER_MD5_H -/* Try to avoid clashes with OpenSSL */ -#define HEADER_MD5_H -#endif - -struct MD5Context { - __u32 buf[4]; - __u32 bits[2]; - unsigned char in[64]; -}; -#endif /* !MD5_H */ - -#ifndef _HMAC_MD5_H -struct HMACMD5Context { - struct MD5Context ctx; - unsigned char k_ipad[65]; - unsigned char k_opad[65]; -}; -#endif /* _HMAC_MD5_H */ - -void cifs_MD5_init(struct MD5Context *context); -void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf, - unsigned len); -void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context); - -/* The following definitions come from lib/hmacmd5.c */ - -/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len, - struct HMACMD5Context *ctx);*/ -void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len, - struct HMACMD5Context *ctx); -void hmac_md5_update(const unsigned char *text, int text_len, - struct HMACMD5Context *ctx); -void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx); -/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len, - unsigned char *digest);*/ diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 192ea51af20f..30135005e4f3 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -32,7 +32,6 @@ #include "cifs_unicode.h" #include "cifspdu.h" #include "cifsglob.h" -#include "md5.h" #include "cifs_debug.h" #include "cifsencrypt.h" -- cgit v1.2.1 From 72432ffcf555decbbae47f1be338e1d2f210aa69 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 24 Jan 2011 14:16:35 -0500 Subject: CIFS: Implement cifs_strict_writev (try #4) If we don't have Exclusive oplock we write a data to the server. Also set invalidate_mapping flag on the inode if we wrote something to the server. Add cifs_iovec_write to let the client write iovec buffers through CIFSSMBWrite2. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 15 ++-- fs/cifs/cifsfs.h | 4 +- fs/cifs/cifsproto.h | 2 + fs/cifs/file.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 217 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a8323f1dc1c4..f2970136d17d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -600,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ssize_t written; + int rc; written = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (!CIFS_I(inode)->clientCanCacheAll) - filemap_fdatawrite(inode->i_mapping); + + if (CIFS_I(inode)->clientCanCacheAll) + return written; + + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); + return written; } @@ -737,7 +744,7 @@ const struct file_operations cifs_file_strict_ops = { .read = do_sync_read, .write = do_sync_write, .aio_read = cifs_strict_readv, - .aio_write = cifs_file_aio_write, + .aio_write = cifs_strict_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -793,7 +800,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .read = do_sync_read, .write = do_sync_write, .aio_read = cifs_strict_readv, - .aio_write = cifs_file_aio_write, + .aio_write = cifs_strict_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_strict_fsync, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index f23206d46531..14789a97304e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -85,7 +85,9 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data, extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, - size_t write_size, loff_t *poffset); + size_t write_size, loff_t *poffset); +extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, int); extern int cifs_strict_fsync(struct file *, int); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 982895fa7615..35c989f4924f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -85,6 +85,8 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); +extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern unsigned int smbCalcSize(struct smb_hdr *ptr); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d7d65a70678e..0de17c1db608 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -848,7 +848,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) } /* update the file size (if needed) after a write */ -static void +void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written) { @@ -1619,6 +1619,206 @@ int cifs_flush(struct file *file, fl_owner_t id) return rc; } +static int +cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) +{ + int rc = 0; + unsigned long i; + + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_page(__GFP_HIGHMEM); + if (!pages[i]) { + /* + * save number of pages we have already allocated and + * return with ENOMEM error + */ + num_pages = i; + rc = -ENOMEM; + goto error; + } + } + + return rc; + +error: + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + return rc; +} + +static inline +size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) +{ + size_t num_pages; + size_t clen; + + clen = min_t(const size_t, len, wsize); + num_pages = clen / PAGE_CACHE_SIZE; + if (clen % PAGE_CACHE_SIZE) + num_pages++; + + if (cur_len) + *cur_len = clen; + + return num_pages; +} + +static ssize_t +cifs_iovec_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) +{ + size_t total_written = 0, written = 0; + unsigned long num_pages, npages; + size_t copied, len, cur_len, i; + struct kvec *to_send; + struct page **pages; + struct iov_iter it; + struct inode *inode; + struct cifsFileInfo *open_file; + struct cifsTconInfo *pTcon; + struct cifs_sb_info *cifs_sb; + int xid, rc; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; + + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + num_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + + pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL); + if (!to_send) { + kfree(pages); + return -ENOMEM; + } + + rc = cifs_write_allocate_pages(pages, num_pages); + if (rc) { + kfree(pages); + kfree(to_send); + return rc; + } + + xid = GetXid(); + open_file = file->private_data; + pTcon = tlink_tcon(open_file->tlink); + inode = file->f_path.dentry->d_inode; + + iov_iter_init(&it, iov, nr_segs, len, 0); + npages = num_pages; + + do { + size_t save_len = cur_len; + for (i = 0; i < npages; i++) { + copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE); + copied = iov_iter_copy_from_user(pages[i], &it, 0, + copied); + cur_len -= copied; + iov_iter_advance(&it, copied); + to_send[i+1].iov_base = kmap(pages[i]); + to_send[i+1].iov_len = copied; + } + + cur_len = save_len - cur_len; + + do { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, false); + if (rc != 0) + break; + } + rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, + cur_len, *poffset, &written, + to_send, npages, 0); + } while (rc == -EAGAIN); + + for (i = 0; i < npages; i++) + kunmap(pages[i]); + + if (written) { + len -= written; + total_written += written; + cifs_update_eof(CIFS_I(inode), *poffset, written); + *poffset += written; + } else if (rc < 0) { + if (!total_written) + total_written = rc; + break; + } + + /* get length and number of kvecs of the next write */ + npages = get_numpages(cifs_sb->wsize, len, &cur_len); + } while (len > 0); + + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); + } + + cifs_stats_bytes_written(pTcon, total_written); + mark_inode_dirty_sync(inode); + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(to_send); + kfree(pages); + FreeXid(xid); + return total_written; +} + +static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t written; + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * write request. + */ + + written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos); + if (written > 0) { + CIFS_I(inode)->invalid_mapping = true; + iocb->ki_pos = pos; + } + + return written; +} + +ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheAll) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to write the data to the server exactly + * from the pos to pos+len-1 rather than flush all affected pages + * because it may cause a error with mandatory locks on these pages but + * not on the region from pos to ppos+len-1. + */ + + return cifs_user_writev(iocb, iov, nr_segs, pos); +} + static ssize_t cifs_iovec_read(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) -- cgit v1.2.1 From d39454ffe4a3c85428483b8a8a8e5e797b6363d5 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 24 Jan 2011 14:16:35 -0500 Subject: CIFS: Add strictcache mount option Use for switching on strict cache mode. In this mode the client reads from the cache all the time it has Oplock Level II, otherwise - read from the server. As for write - the client stores a data in the cache in Exclusive Oplock case, otherwise - write directly to the server. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/README | 5 +++++ fs/cifs/connect.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/cifs/README b/fs/cifs/README index 46af99ab3614..fe1683590828 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -452,6 +452,11 @@ A partial list of the supported mount options follows: if oplock (caching token) is granted and held. Note that direct allows write operations larger than page size to be sent to the server. + strictcache Use for switching on strict cache mode. In this mode the + client read from the cache all the time it has Oplock Level II, + otherwise - read from the server. All written data are stored + in the cache, but if the client doesn't have Exclusive Oplock, + it writes the data to the server. acl Allow setfacl and getfacl to manage posix ACLs if server supports them. (default) noacl Do not allow setfacl and getfacl calls on this mount diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0cc3b81c2e84..47034af67b09 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -87,6 +87,7 @@ struct smb_vol { bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; + bool strict_io:1; /* strict cache behavior */ bool remap:1; /* set to remap seven reserved chars in filenames */ bool posix_paths:1; /* unset to not ask for posix pathnames. */ bool no_linux_ext:1; @@ -1344,6 +1345,8 @@ cifs_parse_mount_options(char *options, const char *devname, vol->direct_io = 1; } else if (strnicmp(data, "forcedirectio", 13) == 0) { vol->direct_io = 1; + } else if (strnicmp(data, "strictcache", 11) == 0) { + vol->strict_io = 1; } else if (strnicmp(data, "noac", 4) == 0) { printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " @@ -2584,6 +2587,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, if (pvolume_info->multiuser) cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_NO_PERM); + if (pvolume_info->strict_io) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; if (pvolume_info->direct_io) { cFYI(1, "mounting share using direct i/o"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; -- cgit v1.2.1 From ad3d2eedf0ed3611f5f86b9e4d0d15cc76c63465 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 17 Jan 2011 18:41:50 +0000 Subject: NFS4: Avoid potential NULL pointer dereference in decode_and_add_ds(). On Mon, 17 Jan 2011, Mi Jinlong wrote: > > > Jesper Juhl: > > strrchr() can return NULL if nothing is found. If this happens we'll > > dereference a NULL pointer in > > fs/nfs/nfs4filelayoutdev.c::decode_and_add_ds(). > > > > I tried to find some other code that guarantees that this can never > > happen but I was unsuccessful. So, unless someone else can point to some > > code that ensures this can never be a problem, I believe this patch is > > needed. > > > > While I was changing this code I also noticed that all the dprintk() > > statements, except one, start with "%s:". The one missing the ":" I added > > it to. > > Maybe another one also should be changed at decode_and_add_ds() at line 243: > > 243 printk("%s Decoded address and port %s\n", __func__, buf); > Missed that one. Thanks. Signed-off-by: Jesper Juhl Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 51fe64ace55a..f5c9b125e8cc 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -214,7 +214,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) /* ipv6 length plus port is legal */ if (rlen > INET6_ADDRSTRLEN + 8) { - dprintk("%s Invalid address, length %d\n", __func__, + dprintk("%s: Invalid address, length %d\n", __func__, rlen); goto out_err; } @@ -225,6 +225,11 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) /* replace the port dots with dashes for the in4_pton() delimiter*/ for (i = 0; i < 2; i++) { char *res = strrchr(buf, '.'); + if (!res) { + dprintk("%s: Failed finding expected dots in port\n", + __func__); + goto out_free; + } *res = '-'; } @@ -240,7 +245,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) port = htons((tmp[0] << 8) | (tmp[1])); ds = nfs4_pnfs_ds_add(inode, ip_addr, port); - dprintk("%s Decoded address and port %s\n", __func__, buf); + dprintk("%s: Decoded address and port %s\n", __func__, buf); out_free: kfree(buf); out_err: -- cgit v1.2.1 From 839f7ad6932d95f4d5ae7267b95c574714ff3d5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Jan 2011 15:54:57 +0000 Subject: NFS: Fix "kernel BUG at fs/aio.c:554!" Nick Piggin reports: > I'm getting use after frees in aio code in NFS > > [ 2703.396766] Call Trace: > [ 2703.396858] [] ? native_sched_clock+0x27/0x80 > [ 2703.396959] [] ? put_lock_stats+0xe/0x40 > [ 2703.397058] [] ? lock_release_holdtime+0xa8/0x140 > [ 2703.397159] [] lock_acquire+0x95/0x1b0 > [ 2703.397260] [] ? aio_put_req+0x2b/0x60 > [ 2703.397361] [] ? get_parent_ip+0x11/0x50 > [ 2703.397464] [] _raw_spin_lock_irq+0x41/0x80 > [ 2703.397564] [] ? aio_put_req+0x2b/0x60 > [ 2703.397662] [] aio_put_req+0x2b/0x60 > [ 2703.397761] [] do_io_submit+0x2be/0x7c0 > [ 2703.397895] [] sys_io_submit+0xb/0x10 > [ 2703.397995] [] system_call_fastpath+0x16/0x1b > > Adding some tracing, it is due to nfs completing the request then > returning something other than -EIOCBQUEUED, so aio.c > also completes the request. To address this, prevent the NFS direct I/O engine from completing async iocbs when the forward path returns an error without starting any I/O. This fix appears to survive ^C during both "xfstest no. 208" and "fsx -Z." It's likely this bug has existed for a very long while, as we are seeing very similar symptoms in OEL 5. Copying stable. Cc: Stable Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e6ace0d93c71..9943a75bb6d1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + if (put_dreq(dreq)) nfs_direct_complete(dreq); - - if (requested_bytes != 0) - return 0; - - if (result < 0) - return result; - return -EIO; + return 0; } static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, @@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + if (put_dreq(dreq)) nfs_direct_write_complete(dreq, dreq->inode); - - if (requested_bytes != 0) - return 0; - - if (result < 0) - return result; - return -EIO; + return 0; } static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, -- cgit v1.2.1 From ee5dc7732bd557bae6d10873a0aac606d2c551fb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Jan 2011 03:05:18 +0000 Subject: NFS: Fix "kernel BUG at fs/nfs/nfs3xdr.c:1338!" Milan Broz reports: > on today Linus' tree I get OOps if using nfs. > > server (2.6.36) exports dir: > /dir 172.16.1.0/24(rw,async,all_squash,no_subtree_check,anonuid=500,anongid=500) > > on client it is mounted in fstab > server:/dir /mnt/tst nfs rw,soft 0 0 > > and these commands OOpses it (simplified from a configure script): > > cd /dir > touch x > install x y > > [ 105.327701] ------------[ cut here ]------------ > [ 105.327979] kernel BUG at fs/nfs/nfs3xdr.c:1338! > [ 105.328075] invalid opcode: 0000 [#1] PREEMPT SMP > [ 105.328223] last sysfs file: /sys/devices/virtual/bdi/0:16/uevent > [ 105.328349] Modules linked in: usbcore dm_mod > [ 105.328553] > [ 105.328678] Pid: 3710, comm: install Not tainted 2.6.37+ #423 440BX Desktop Reference Platform/VMware Virtual Platform > [ 105.328853] EIP: 0060:[] EFLAGS: 00010282 CPU: 0 > [ 105.329152] EIP is at nfs3_xdr_enc_setacl3args+0x61/0x98 > [ 105.329249] EAX: ffffffea EBX: ce941d98 ECX: 00000000 EDX: 00000004 > [ 105.329340] ESI: ce941cd0 EDI: 000000a4 EBP: ce941cc0 ESP: ce941cb4 > [ 105.329431] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 > [ 105.329525] Process install (pid: 3710, ti=ce940000 task=ced36f20 task.ti=ce940000) > [ 105.336600] Stack: > [ 105.336693] ce941cd0 ce9dc000 00000000 ce941cf8 c12ecd02 c12f43e0 c116c00b cf754158 > [ 105.336982] ce9dc004 cf754284 ce9dc004 cf7ffee8 ceff9978 ce9dc000 cf7ffee8 ce9dc000 > [ 105.337182] ce9dc000 ce941d14 c12e698d cf75412c ce941d98 cf7ffee8 cf7fff20 00000000 > [ 105.337405] Call Trace: > [ 105.337695] [] rpcauth_wrap_req+0x75/0x7f > [ 105.337806] [] ? xdr_encode_opaque+0x12/0x15 > [ 105.337898] [] ? nfs3_xdr_enc_setacl3args+0x0/0x98 > [ 105.337988] [] call_transmit+0x17e/0x1e8 > [ 105.338072] [] __rpc_execute+0x6d/0x1a6 > [ 105.338155] [] rpc_execute+0x34/0x37 > [ 105.338235] [] rpc_run_task+0xb5/0xbd > [ 105.338316] [] rpc_call_sync+0x3d/0x58 > [ 105.338402] [] nfs3_proc_setacls+0x18e/0x24f > [ 105.338493] [] ? __kmalloc+0x148/0x1c4 > [ 105.338579] [] ? posix_acl_alloc+0x12/0x22 > [ 105.338665] [] nfs3_proc_setacl+0xa0/0xca > [ 105.338748] [] nfs3_setxattr+0x62/0x88 > [ 105.338834] [] ? sub_preempt_count+0x7c/0x89 > [ 105.338926] [] ? nfs3_setxattr+0x0/0x88 > [ 105.339026] [] __vfs_setxattr_noperm+0x26/0x95 > [ 105.339114] [] vfs_setxattr+0x5b/0x76 > [ 105.339211] [] setxattr+0x9d/0xc3 > [ 105.339298] [] ? handle_pte_fault+0x258/0x5cb > [ 105.339428] [] ? __free_pages+0x1a/0x23 > [ 105.339517] [] ? up_read+0x16/0x2c > [ 105.339599] [] ? fget+0x0/0xa3 > [ 105.339677] [] ? fget+0x0/0xa3 > [ 105.339760] [] ? get_parent_ip+0xb/0x31 > [ 105.339843] [] ? sub_preempt_count+0x7c/0x89 > [ 105.339931] [] sys_fsetxattr+0x51/0x79 > [ 105.340014] [] sysenter_do_call+0x12/0x32 > [ 105.340133] Code: 2e 76 18 00 58 31 d2 8b 7f 28 f6 43 04 01 74 03 8b 53 08 6a 00 8b 46 04 6a 01 8b 0b 52 89 fa e8 85 10 f8 ff 83 c4 0c 85 c0 79 04 <0f> 0b eb fe 31 c9 f6 43 04 04 74 03 8b 4b 0c 68 00 10 00 00 8d > [ 105.350321] EIP: [] nfs3_xdr_enc_setacl3args+0x61/0x98 SS:ESP 0068:ce941cb4 > [ 105.364385] ---[ end trace 01fcfe7f0f7f6e4a ]--- nfs3_xdr_enc_setacl3args() is not properly setting up the target buffer before nfsacl_encode() attempts to encode the ACL. Introduced by commit d9c407b1 "NFS: Introduce new-style XDR encoding functions for NFSv3." Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 01c5e8b1941d..183c6b123d0f 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, NFS_FH(args->inode)); encode_uint32(xdr, args->mask); + + base = req->rq_slen; if (args->npages != 0) xdr_write_pages(xdr, args->pages, 0, args->len); + else + xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); - base = req->rq_slen; error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? args->acl_access : NULL, 1, 0); -- cgit v1.2.1 From 731f3f482ad3b2c58a1af2d0a9a634a82803706a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Jan 2011 03:05:28 +0000 Subject: NFS: nfsacl_{encode,decode} should return signed integer Clean up. The nfsacl_encode() and nfsacl_decode() functions return negative errno values, and each call site verifies that the returned value is not negative. Change the synopsis of both of these functions to reflect this usage. Document the synopsis and return values. Reported-by: Trond Myklebust Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs_common/nfsacl.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index fc1c52571c03..a3e78bd18679 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -72,9 +72,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) return 0; } -unsigned int -nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, - struct posix_acl *acl, int encode_entries, int typeflag) +/** + * nfsacl_encode - Encode an NFSv3 ACL + * + * @buf: destination xdr_buf to contain XDR encoded ACL + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @inode: inode of file whose ACL this is + * @acl: posix_acl to encode + * @encode_entries: whether to encode ACEs as well + * @typeflag: ACL type: NFS_ACL_DEFAULT or zero + * + * Returns size of encoded ACL in bytes or a negative errno value. + */ +int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, + struct posix_acl *acl, int encode_entries, int typeflag) { int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; struct nfsacl_encode_desc nfsacl_desc = { @@ -224,9 +235,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl) return 0; } -unsigned int -nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, - struct posix_acl **pacl) +/** + * nfsacl_decode - Decode an NFSv3 ACL + * + * @buf: xdr_buf containing XDR'd ACL data to decode + * @base: byte offset in xdr_buf where XDR'd ACL begins + * @aclcnt: count of ACEs in decoded posix_acl + * @pacl: buffer in which to place decoded posix_acl + * + * Returns the length of the decoded ACL in bytes, or a negative errno value. + */ +int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, + struct posix_acl **pacl) { struct nfsacl_decode_desc nfsacl_desc = { .desc = { -- cgit v1.2.1 From f61f6da0d53842e849bab7f69e1431bd3de1136d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Jan 2011 03:05:38 +0000 Subject: NFS: Prevent memory allocation failure in nfsacl_encode() nfsacl_encode() allocates memory in certain cases. This of course is not guaranteed to work. Since commit 9f06c719 "SUNRPC: New xdr_streams XDR encoder API", the kernel's XDR encoders can't return a result indicating possibly a failure, so a memory allocation failure in nfsacl_encode() has become fatal (ie, the XDR code Oopses) in some cases. However, the allocated memory is a tiny fixed amount, on the order of 40-50 bytes. We can easily use a stack-allocated buffer for this, with only a wee bit of nose-holding. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 4 ++-- fs/nfs_common/nfsacl.c | 22 +++++++++++++++------- fs/posix_acl.c | 17 +++++++++++++---- 3 files changed, 30 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9f88c5f4c7e2..274342771655 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (!nfs_server_capable(inode, NFS_CAP_ACLS)) goto out; - /* We are doing this here, because XDR marshalling can only - return -ENOMEM. */ + /* We are doing this here because XDR marshalling does not + * return any results, it BUGs. */ status = -ENOSPC; if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) goto out; diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index a3e78bd18679..84c27d69d421 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -42,6 +42,11 @@ struct nfsacl_encode_desc { gid_t gid; }; +struct nfsacl_simple_acl { + struct posix_acl acl; + struct posix_acl_entry ace[4]; +}; + static int xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) { @@ -99,17 +104,22 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, .uid = inode->i_uid, .gid = inode->i_gid, }; + struct nfsacl_simple_acl aclbuf; int err; - struct posix_acl *acl2 = NULL; if (entries > NFS_ACL_MAX_ENTRIES || xdr_encode_word(buf, base, entries)) return -EINVAL; if (encode_entries && acl && acl->a_count == 3) { - /* Fake up an ACL_MASK entry. */ - acl2 = posix_acl_alloc(4, GFP_KERNEL); - if (!acl2) - return -ENOMEM; + struct posix_acl *acl2 = &aclbuf.acl; + + /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is + * invoked in contexts where a memory allocation failure is + * fatal. Fortunately this fake ACL is small enough to + * construct on the stack. */ + memset(acl2, 0, sizeof(acl2)); + posix_acl_init(acl2, 4); + /* Insert entries in canonical order: other orders seem to confuse Solaris VxFS. */ acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ @@ -120,8 +130,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, nfsacl_desc.acl = acl2; } err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); - if (acl2) - posix_acl_release(acl2); if (!err) err = 8 + nfsacl_desc.desc.elem_size * nfsacl_desc.desc.array_len; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 39df95a0ec25..b1cf6bf4b41d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -22,6 +22,7 @@ #include +EXPORT_SYMBOL(posix_acl_init); EXPORT_SYMBOL(posix_acl_alloc); EXPORT_SYMBOL(posix_acl_clone); EXPORT_SYMBOL(posix_acl_valid); @@ -31,6 +32,16 @@ EXPORT_SYMBOL(posix_acl_create_masq); EXPORT_SYMBOL(posix_acl_chmod_masq); EXPORT_SYMBOL(posix_acl_permission); +/* + * Init a fresh posix_acl + */ +void +posix_acl_init(struct posix_acl *acl, int count) +{ + atomic_set(&acl->a_refcount, 1); + acl->a_count = count; +} + /* * Allocate a new ACL with the specified number of entries. */ @@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags) const size_t size = sizeof(struct posix_acl) + count * sizeof(struct posix_acl_entry); struct posix_acl *acl = kmalloc(size, flags); - if (acl) { - atomic_set(&acl->a_refcount, 1); - acl->a_count = count; - } + if (acl) + posix_acl_init(acl, count); return acl; } -- cgit v1.2.1 From 80c30e8de4f81851b1f712bcc596e11d53bc76f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 24 Jan 2011 20:50:26 +0000 Subject: NLM: Fix "kernel BUG at fs/lockd/host.c:417!" or ".../host.c:283!" Nick Bowler reports: > We were just having some NFS server troubles, and my client machine > running 2.6.38-rc1+ (specifically, commit 2b1caf6ed7b888c95) crashed > hard (syslog output appended to this mail). > > I'm not sure what the exact timeline was or how to reproduce this, > but the server was rebooted during all this. Since I've never seen > this happen before, it is possibly a regression from previous kernel > releases. However, I recently updated my nfs-utils (on the client) to > version 1.2.3, so that might be related as well. [ BUG output redacted ] When done searching, the for_each_host loop in next_host_state() falls through and returns the final host on the host chain without bumping it's reference count. Since the host's ref count is only one at that point, releasing the host in nlm_host_rebooted() attempts to destroy the host prematurely, and therefore hits a BUG(). Likely, the original intent of the for_each_host behavior in next_host_state() was to handle the case when the host chain is empty. Searching the chain and finding no suitable host to return needs to be handled as well. Defensively restructure next_host_state() always to return NULL when the loop falls through. Introduced by commit b10e30f6 "lockd: reorganize nlm_host_rebooted". Cc: J. Bruce Fields Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 5f1bcb2f06f3..b7c99bfb3da6 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -520,7 +520,7 @@ static struct nlm_host *next_host_state(struct hlist_head *cache, struct nsm_handle *nsm, const struct nlm_reboot *info) { - struct nlm_host *host = NULL; + struct nlm_host *host; struct hlist_head *chain; struct hlist_node *pos; @@ -532,12 +532,13 @@ static struct nlm_host *next_host_state(struct hlist_head *cache, host->h_state++; nlm_get_host(host); - goto out; + mutex_unlock(&nlm_host_mutex); + return host; } } -out: + mutex_unlock(&nlm_host_mutex); - return host; + return NULL; } /** -- cgit v1.2.1 From 778be232a207e79088ba70d832ac25dfea6fbf1a Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 25 Jan 2011 15:38:01 +0000 Subject: NFS do not find client in NFSv4 pg_authenticate The information required to find the nfs_client cooresponding to the incoming back channel request is contained in the NFS layer. Perform minimal checking in the RPC layer pg_authenticate method, and push more detailed checking into the NFS layer where the nfs_client can be found. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 109 +++++++++++++------------------------------------ fs/nfs/callback.h | 4 +- fs/nfs/callback_proc.c | 10 +---- fs/nfs/callback_xdr.c | 5 +-- fs/nfs/client.c | 15 +++---- fs/nfs/internal.h | 3 +- fs/nfs/nfs4state.c | 6 --- 7 files changed, 41 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 199016528fcb..e3d294269058 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -134,33 +134,6 @@ out_err: } #if defined(CONFIG_NFS_V4_1) -/* - * * CB_SEQUENCE operations will fail until the callback sessionid is set. - * */ -int nfs4_set_callback_sessionid(struct nfs_client *clp) -{ - struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv; - struct nfs4_sessionid *bc_sid; - - if (!serv->sv_bc_xprt) - return -EINVAL; - - /* on success freed in xprt_free */ - bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL); - if (!bc_sid) - return -ENOMEM; - memcpy(bc_sid->data, &clp->cl_session->sess_id.data, - NFS4_MAX_SESSIONID_LEN); - spin_lock_bh(&serv->sv_cb_lock); - serv->sv_bc_xprt->xpt_bc_sid = bc_sid; - spin_unlock_bh(&serv->sv_cb_lock); - dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__, - ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1], - ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3], - serv->sv_bc_xprt); - return 0; -} - /* * The callback service for NFSv4.1 callbacks */ @@ -266,10 +239,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct nfs_callback_data *cb_info) { } -int nfs4_set_callback_sessionid(struct nfs_client *clp) -{ - return 0; -} #endif /* CONFIG_NFS_V4_1 */ /* @@ -359,78 +328,58 @@ void nfs_callback_down(int minorversion) mutex_unlock(&nfs_callback_mutex); } -static int check_gss_callback_principal(struct nfs_client *clp, - struct svc_rqst *rqstp) +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) { struct rpc_clnt *r = clp->cl_rpcclient; char *p = svc_gss_principal(rqstp); + if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) + return 1; + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ if (clp->cl_minorversion != 0) - return SVC_DROP; + return 0; /* * It might just be a normal user principal, in which case * userspace won't bother to tell us the name at all. */ if (p == NULL) - return SVC_DENIED; + return 0; /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ if (memcmp(p, "nfs@", 4) != 0) - return SVC_DENIED; + return 0; p += 4; if (strcmp(p, r->cl_server) != 0) - return SVC_DENIED; - return SVC_OK; + return 0; + return 1; } -/* pg_authenticate method helper */ -static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) -{ - struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); - int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0; - - dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); - if (svc_is_backchannel(rqstp)) - /* Sessionid (usually) set after CB_NULL ping */ - return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid, - is_cb_compound); - else - /* No callback identifier in pg_authenticate */ - return nfs4_find_client_no_ident(svc_addr(rqstp)); -} - -/* pg_authenticate method for nfsv4 callback threads. */ +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. + * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */ static int nfs_callback_authenticate(struct svc_rqst *rqstp) { - struct nfs_client *clp; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - int ret = SVC_OK; - - /* Don't talk to strangers */ - clp = nfs_cb_find_client(rqstp); - if (clp == NULL) - return SVC_DROP; - - dprintk("%s: %s NFSv4 callback!\n", __func__, - svc_print_addr(rqstp, buf, sizeof(buf))); - switch (rqstp->rq_authop->flavour) { - case RPC_AUTH_NULL: - if (rqstp->rq_proc != CB_NULL) - ret = SVC_DENIED; - break; - case RPC_AUTH_UNIX: - break; - case RPC_AUTH_GSS: - ret = check_gss_callback_principal(clp, rqstp); - break; - default: - ret = SVC_DENIED; + case RPC_AUTH_NULL: + if (rqstp->rq_proc != CB_NULL) + return SVC_DROP; + break; + case RPC_AUTH_GSS: + /* No RPC_AUTH_GSS support yet in NFSv4.1 */ + if (svc_is_backchannel(rqstp)) + return SVC_DROP; } - nfs_put_client(clp); - return ret; + return SVC_OK; } /* diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index d3b44f9bd747..46d93ce7311b 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -7,6 +7,7 @@ */ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H +#include #define NFS4_CALLBACK 0x40000000 #define NFS4_CALLBACK_XDRSIZE 2048 @@ -37,7 +38,6 @@ enum nfs4_callback_opnum { struct cb_process_state { __be32 drc_status; struct nfs_client *clp; - struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */ }; struct cb_compound_hdr_arg { @@ -168,7 +168,7 @@ extern unsigned nfs4_callback_layoutrecall( extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); extern void nfs4_cb_take_slot(struct nfs_client *clp); #endif /* CONFIG_NFS_V4_1 */ - +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res, struct cb_process_state *cps); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 4bb91cb2620d..829f406e91dd 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -373,17 +373,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, { struct nfs_client *clp; int i; - __be32 status; + __be32 status = htonl(NFS4ERR_BADSESSION); cps->clp = NULL; - status = htonl(NFS4ERR_BADSESSION); - /* Incoming session must match the callback session */ - if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN)) - goto out; - - clp = nfs4_find_client_sessionid(args->csa_addr, - &args->csa_sessionid, 1); + clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); if (clp == NULL) goto out; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 23112c263f81..14e0f9371d14 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -794,10 +794,9 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); - if (!cps.clp) + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) return rpc_drop_reply; - } else - cps.svc_sid = bc_xprt_sid(rqstp); + } hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 192f2f860265..bd3ca32879e7 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1206,16 +1206,11 @@ nfs4_find_client_ident(int cb_ident) * For CB_COMPOUND calls, find a client by IP address, protocol version, * minorversion, and sessionID * - * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service - * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL - * can arrive before the callback sessionid is set. For CB_NULL calls, - * find a client by IP address protocol version, and minorversion. - * * Returns NULL if no such client */ struct nfs_client * nfs4_find_client_sessionid(const struct sockaddr *addr, - struct nfs4_sessionid *sid, int is_cb_compound) + struct nfs4_sessionid *sid) { struct nfs_client *clp; @@ -1227,9 +1222,9 @@ nfs4_find_client_sessionid(const struct sockaddr *addr, if (!nfs4_has_session(clp)) continue; - /* Match sessionid unless cb_null call*/ - if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, - sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) continue; atomic_inc(&clp->cl_count); @@ -1244,7 +1239,7 @@ nfs4_find_client_sessionid(const struct sockaddr *addr, struct nfs_client * nfs4_find_client_sessionid(const struct sockaddr *addr, - struct nfs4_sessionid *sid, int is_cb_compound) + struct nfs4_sessionid *sid) { return NULL; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4644f04b4b46..cf9fdbdabc67 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -133,8 +133,7 @@ extern void nfs_put_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); extern struct nfs_client *nfs4_find_client_ident(int); extern struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, - int); +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); extern struct nfs_server *nfs_create_server( const struct nfs_parsed_mount_data *, struct nfs_fh *); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2336d532cf66..e6742b57a04c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -232,12 +232,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) status = nfs4_proc_create_session(clp); if (status != 0) goto out; - status = nfs4_set_callback_sessionid(clp); - if (status != 0) { - printk(KERN_WARNING "Sessionid not set. No callback service\n"); - nfs_callback_down(1); - status = 0; - } nfs41_setup_state_renewal(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: -- cgit v1.2.1 From 2c4cdf8f6d3cfb48036400952329555099c8c92c Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 25 Jan 2011 15:38:02 +0000 Subject: NFS fix cb_sequence error processing Always assign the cb_process_state nfs_client pointer so a processing error in cb_sequence after the nfs_client is found and referenced returns a non-NULL cb_process_state nfs_client and the matching nfs_put_client in nfs4_callback_compound dereferences the client. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 829f406e91dd..89587573fe50 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -408,9 +408,9 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; nfs4_cb_take_slot(clp); - cps->clp = clp; /* put in nfs4_callback_compound */ out: + cps->clp = clp; /* put in nfs4_callback_compound */ for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); -- cgit v1.2.1 From b2a2897dc4a59684321de425652061c62a0569d0 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 25 Jan 2011 15:38:03 +0000 Subject: NFS improve pnfs_put_deviceid_cache debug print What we really want to know is the ref count. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bc4089769735..1b1bc1a0fb0a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -951,7 +951,7 @@ pnfs_put_deviceid_cache(struct nfs_client *clp) { struct pnfs_deviceid_cache *local = clp->cl_devid_cache; - dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); + dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { int i; /* Verify cache is empty */ -- cgit v1.2.1 From 27dc1cd3ad9300f81e1219e5fc305d91d85353f8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 25 Jan 2011 15:28:21 -0500 Subject: NFS: nfs_wcc_update_inode() should set nfsi->attr_gencount If the call to nfs_wcc_update_inode() results in an attribute update, we need to ensure that the inode's attr_gencount gets bumped too, otherwise we are not protected against races with other GETATTR calls. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d8512423ba72..1cc600e77bb4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -881,9 +881,10 @@ out: return ret; } -static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long ret = 0; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -891,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->npages == 0) - i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + && nfsi->npages == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + return ret; } /** @@ -1223,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); + invalid |= nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { -- cgit v1.2.1 From 3689456b4bd36027022b3215eb2acba51cd0e6b5 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 25 Jan 2011 15:07:34 -0800 Subject: squashfs: fix use of uninitialised variable in zlib & xz decompressors Fix potential use of uninitialised variable caused by recent decompressor code optimisations. In zlib_uncompress (zlib_wrapper.c) we have int zlib_err, zlib_init = 0; ... do { ... if (avail == 0) { offset = 0; put_bh(bh[k++]); continue; } ... zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); ... } while (zlib_err == Z_OK); If continue is executed (avail == 0) then the while condition will be evaluated testing zlib_err, which is uninitialised first time around the loop. Fix this by getting rid of the 'if (avail == 0)' condition test, this edge condition should not be being handled in the decompressor code, and instead handle it generically in the caller code. Similarly for xz_wrapper.c. Incidentally, on most architectures (bar Mips and Parisc), no uninitialised variable warning is generated by gcc, this is because the while condition test on continue is optimised out and not performed (when executing continue zlib_err has not been changed since entering the loop, and logically if the while condition was true previously, then it's still true). Signed-off-by: Phillip Lougher Reported-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/block.c | 8 ++++++++ fs/squashfs/xz_wrapper.c | 6 ------ fs/squashfs/zlib_wrapper.c | 6 ------ 3 files changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2fb2882f0fa7..8ab48bc2fa7d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -63,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb, *length = (unsigned char) bh->b_data[*offset] | (unsigned char) bh->b_data[*offset + 1] << 8; *offset += 2; + + if (*offset == msblk->devblksize) { + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *offset = 0; + } } return bh; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 856756ca5ee4..c4eb40018256 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -95,12 +95,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, if (!buffer_uptodate(bh[k])) goto release_mutex; - if (avail == 0) { - offset = 0; - put_bh(bh[k++]); - continue; - } - stream->buf.in = bh[k]->b_data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 818a5e063faf..4661ae2b1cec 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -82,12 +82,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, if (!buffer_uptodate(bh[k])) goto release_mutex; - if (avail == 0) { - offset = 0; - put_bh(bh[k++]); - continue; - } - stream->next_in = bh[k]->b_data + offset; stream->avail_in = avail; offset = 0; -- cgit v1.2.1 From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Tue, 25 Jan 2011 15:07:35 -0800 Subject: console: rename acquire/release_console_sem() to console_lock/unlock() The -rt patches change the console_semaphore to console_mutex. As a result, a quite large chunk of the patches changes all acquire/release_console_sem() to acquire/release_console_mutex() This commit makes things use more neutral function names which dont make implications about the underlying lock. The only real change is the return value of console_trylock which is inverted from try_acquire_console_sem() This patch also paves the way to switching console_sem from a semaphore to a mutex. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert] Signed-off-by: Torben Hohn Cc: Thomas Gleixner Cc: Greg KH Cc: Ingo Molnar Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/consoles.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index eafc22ab1fdd..b701eaa482bf 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) struct console *con; loff_t off = 0; - acquire_console_sem(); + console_lock(); for_each_console(con) if (off++ == *pos) break; @@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { - release_console_sem(); + console_unlock(); } static const struct seq_operations consoles_op = { -- cgit v1.2.1 From c7a360b05b5430ac1d75dc7d53c586ada60a05cb Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 25 Jan 2011 19:15:32 -0500 Subject: NFS construct consistent co_ownerid for v4.1 As stated in section 2.4 of RFC 5661, subsequent instances of the client need to present the same co_ownerid. Concatinate the client's IP dot address, host name, and the rpc_auth pseudoflavor to form the co_ownerid. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9d992b0346e3..78936a8f40ab 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "nfs4_fs.h" #include "delegation.h" @@ -4572,27 +4573,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) *p = htonl((u32)clp->cl_boot_time.tv_nsec); args.verifier = &verifier; - while (1) { - args.id_len = scnprintf(args.id, sizeof(args.id), - "%s/%s %u", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - clp->cl_id_uniquifier); - - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); - - if (status != -NFS4ERR_CLID_INUSE) - break; - - if (signalled()) - break; - - if (++clp->cl_id_uniquifier == 0) - break; - } + args.id_len = scnprintf(args.id, sizeof(args.id), + "%s/%s.%s/%u", + clp->cl_ipaddr, + init_utsname()->nodename, + init_utsname()->domainname, + clp->cl_rpcclient->cl_auth->au_flavor); - status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + if (!status) + status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); dprintk("<-- %s status= %d\n", __func__, status); return status; } -- cgit v1.2.1 From 8eb2d829ffea3677c21bd038f19e5d8ca6b43e36 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Nov 2010 14:48:01 +0800 Subject: btrfs: Fix threshold calculation for block groups smaller than 1GB If a block group is smaller than 1GB, the extent entry threadhold calculation will always set the threshold to 0. So as free space gets fragmented, btrfs will switch to use bitmap to manage free space, but then will never switch back to extents due to this bug. Reviewed-by: Josef Bacik Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 60d684266959..42f4015988ec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1016,14 +1016,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; + u64 size = block_group->key.offset; /* * The goal is to keep the total amount of memory used per 1gb of space * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - max_bytes = MAX_CACHE_BYTES_PER_GIG * - (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make -- cgit v1.2.1 From edf6e2d1ddbac7f326b34a27adbca71ece53ccce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Nov 2010 14:50:07 +0800 Subject: btrfs: Add helper function free_bitmap() Remove some duplicated code. This prepares for the next patch. Reviewed-by: Josef Bacik Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 42f4015988ec..850104f05178 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1175,6 +1175,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, recalculate_thresholds(block_group); } +static void free_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); +} + static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) @@ -1215,13 +1225,8 @@ again: if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); - if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); /* * no entry after this bitmap, but we still have bytes to @@ -1254,13 +1259,8 @@ again: return -EAGAIN; goto again; - } else if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + } else if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); return 0; } @@ -1689,13 +1689,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, ret = offset; if (entry->bitmap) { bitmap_clear_bits(block_group, entry, offset, bytes); - if (!entry->bytes) { - unlink_free_space(block_group, entry); - kfree(entry->bitmap); - kfree(entry); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!entry->bytes) + free_bitmap(block_group, entry); } else { unlink_free_space(block_group, entry); entry->offset += bytes; -- cgit v1.2.1 From 70b7da304f9f9bbf1566085155895e32e775a745 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Nov 2010 14:51:45 +0800 Subject: btrfs: Free fully occupied bitmap in cluster If there's no more free space in a bitmap, we should free it. Reviewed-by: Josef Bacik Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 850104f05178..cb0137e4047f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1788,6 +1788,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, ret = search_start; bitmap_clear_bits(block_group, entry, ret, bytes); + if (entry->bytes == 0) + free_bitmap(block_group, entry); out: spin_unlock(&cluster->lock); spin_unlock(&block_group->tree_lock); -- cgit v1.2.1 From 5e71b5d5ec07e4b3fb4c78c4e4b108ff667f123f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Nov 2010 14:55:34 +0800 Subject: btrfs: Update stats when allocating from a cluster When allocating extent entry from a cluster, we should update the free_space and free_extents fields of the block group. Reviewed-by: Josef Bacik Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cb0137e4047f..2974c4744d5c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1843,15 +1843,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry->offset += bytes; entry->bytes -= bytes; - if (entry->bytes == 0) { + if (entry->bytes == 0) rb_erase(&entry->offset_index, &cluster->root); - kfree(entry); - } break; } out: spin_unlock(&cluster->lock); + if (!ret) + return 0; + + spin_lock(&block_group->tree_lock); + + block_group->free_space -= bytes; + if (entry->bytes == 0) { + block_group->free_extents--; + kfree(entry); + } + + spin_unlock(&block_group->tree_lock); + return ret; } -- cgit v1.2.1 From 120d66eec0dcb966fbd03f743598b2ff2513436b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Nov 2010 14:56:50 +0800 Subject: btrfs: Add a helper try_merge_free_space() When adding a new extent, we'll firstly see if we can merge this extent to the left or/and right extent. Extract this as a helper try_merge_free_space(). As a side effect, we fix a small bug that if the new extent has non-bitmap left entry but is unmergeble, we'll directly link the extent without trying to drop it into bitmap. This also prepares for the next patch. Reviewed-by: Josef Bacik Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 75 ++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2974c4744d5c..cf67dc3b7bf8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1363,22 +1363,14 @@ out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +bool try_merge_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) { - struct btrfs_free_space *right_info = NULL; - struct btrfs_free_space *left_info = NULL; - struct btrfs_free_space *info = NULL; - int ret = 0; - - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - - spin_lock(&block_group->tree_lock); + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; /* * first we want to see if there is free space adjacent to the range we @@ -1392,27 +1384,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, else left_info = tree_search_offset(block_group, offset - 1, 0, 0); - /* - * If there was no extent directly to the left or right of this new - * extent then we know we're going to have to allocate a new extent, so - * before we do that see if we need to drop this into a bitmap - */ - if ((!left_info || left_info->bitmap) && - (!right_info || right_info->bitmap)) { - ret = insert_into_bitmap(block_group, info); - - if (ret < 0) { - goto out; - } else if (ret) { - ret = 0; - goto out; - } - } - if (right_info && !right_info->bitmap) { unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); + merged = true; } if (left_info && !left_info->bitmap && @@ -1421,8 +1397,43 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, info->offset = left_info->offset; info->bytes += left_info->bytes; kfree(left_info); + merged = true; } + return merged; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + + if (try_merge_free_space(block_group, info)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(block_group, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: ret = link_free_space(block_group, info); if (ret) kfree(info); -- cgit v1.2.1 From f333adb5d64bc1c4d6099072fc341c3c8f84e0cf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Nov 2010 14:57:39 +0800 Subject: btrfs: Check mergeable free space when removing a cluster After returing extents from a cluster to the block group, some extents in the block group may be mergeable. Reviewed-by: Josef Bacik Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cf67dc3b7bf8..a5501edc3c9f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, return entry; } -static void unlink_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info) +static inline void +__unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); block_group->free_extents--; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + __unlink_free_space(block_group, info); block_group->free_space -= info->bytes; } @@ -1364,7 +1371,7 @@ out: } bool try_merge_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info) + struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *left_info; struct btrfs_free_space *right_info; @@ -1385,7 +1392,10 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group, left_info = tree_search_offset(block_group, offset - 1, 0, 0); if (right_info && !right_info->bitmap) { - unlink_free_space(block_group, right_info); + if (update_stat) + unlink_free_space(block_group, right_info); + else + __unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); merged = true; @@ -1393,7 +1403,10 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group, if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset) { - unlink_free_space(block_group, left_info); + if (update_stat) + unlink_free_space(block_group, left_info); + else + __unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; kfree(left_info); @@ -1418,7 +1431,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, spin_lock(&block_group->tree_lock); - if (try_merge_free_space(block_group, info)) + if (try_merge_free_space(block_group, info, true)) goto link; /* @@ -1636,6 +1649,7 @@ __btrfs_return_cluster_to_free_space( node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); BUG_ON(entry->bitmap); + try_merge_free_space(block_group, entry, false); tree_insert_offset(&block_group->free_space_offset, entry->offset, &entry->offset_index, 0); } -- cgit v1.2.1 From 83a4d54840c88a4a45c49670f044b8c7ddeaa8c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 27 Dec 2010 16:19:53 +0800 Subject: Btrfs: Fix memory leak at umount fs_info, which is allocated in open_ctree(), should be freed in close_ctree(). Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a5d2249e6da5..089871e5cd5a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2513,6 +2513,8 @@ int close_ctree(struct btrfs_root *root) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info); + return 0; } -- cgit v1.2.1 From bdc924bb4cdac92b945945c3149ab8191c92d75d Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Dec 2010 16:33:15 +0800 Subject: Btrfs: Fix memory leak on finding existing super We missed a memory deallocation in commit 450ba0ea. If an existing super block is found at mount and there is no error condition then the pre-allocated tree_root and fs_info are no not used and are not freeded. Signed-off-by: Ian Kent Signed-off-by: Li Zefan --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 61bd79abb805..f50253c2279d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -654,6 +654,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); } else { char b[BDEVNAME_SIZE]; -- cgit v1.2.1 From 3f3d0bc0df041236fad4ffa82188a6e4ef9af75e Mon Sep 17 00:00:00 2001 From: Tero Roponen Date: Mon, 27 Dec 2010 16:43:13 +0800 Subject: Btrfs: Free correct pointer after using strsep We must save and free the original kstrdup()'ed pointer because strsep() modifies its first argument. Signed-off-by: Tero Roponen Signed-off-by: Li Zefan --- fs/btrfs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f50253c2279d..78ee681465af 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -277,7 +277,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *p; + char *opts, *orig, *p; int error = 0; int intarg; @@ -291,6 +291,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; + orig = opts; while ((p = strsep(&opts, ",")) != NULL) { int token; @@ -326,7 +327,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } out_free_opts: - kfree(opts); + kfree(orig); out: /* * If no subvolume name is specified we use the default one. Allocate -- cgit v1.2.1 From d0f69686c2ae775529aadc7a8acc6f13ad41de66 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 25 Jan 2011 15:46:17 +0800 Subject: Btrfs: Don't return acl info when mounting with noacl option Steps to reproduce: # mkfs.btrfs /dev/sda2 # mount /dev/sda2 /mnt # touch /mnt/file0 # setfacl -m 'u:root:x,g::x,o::x' /mnt/file0 # umount /mnt # mount /dev/sda2 -o noacl /mnt # getfacl /mnt/file0 ... user::rw- user:root:--x group::--x mask::--x other::--x The output should be: user::rw- group::--x other::--x Signed-off-by: Miao Xie Signed-off-by: Li Zefan --- fs/btrfs/acl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 2222d161c7b6..3c52fc8afe29 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; + if (!IS_POSIXACL(inode)) + return NULL; + acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) return acl; @@ -82,6 +85,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, struct posix_acl *acl; int ret = 0; + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) -- cgit v1.2.1 From b897abec032deb7cc3ce67392a1f544ac965ddea Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 26 Jan 2011 16:19:22 +0800 Subject: Btrfs: Fix memory leak in writepage fixup work fixup, which is allocated when starting page write to fix up the extent without ORDERED bit set, should be freed after this work is done. Signed-off-by: Miao Xie Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f9194438f7c..3a6edc4c5642 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1544,6 +1544,7 @@ out: out_page: unlock_page(page); page_cache_release(page); + kfree(fixup); } /* -- cgit v1.2.1 From 4d728ec7aefdca5419d2ebfb28c147e81a4b59f4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 26 Jan 2011 14:10:43 +0800 Subject: Btrfs: Fix file clone when source offset is not 0 Suppose: - the source extent is: [0, 100] - the src offset is 10 - the clone length is 90 - the dest offset is 0 This statement: new_key.offset = key.offset + destoff - off will produce such an extent for the dest file: [ino, BTRFS_EXTENT_DATA_KEY, -10] , which is obviously wrong. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f87552a1d7ea..1b61dab64062 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1788,7 +1788,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = inode->i_ino; - new_key.offset = key.offset + destoff - off; + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { -- cgit v1.2.1 From 7db37c5e6575b229a5051be1d3ef15257ae0ba5d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 12:02:00 +1100 Subject: xfs: fix log ticket leak on forced shutdown. The kmemleak detector shows this after test 139: unreferenced object 0xffff880079b88bb0 (size 264): comm "xfs_io", pid 4904, jiffies 4294909382 (age 276.824s) hex dump (first 32 bytes): 00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00 .....N.......... ff ff ff ff ff ff ff ff 48 7b c9 82 ff ff ff ff ........H{...... backtrace: [] kmemleak_alloc+0x2d/0x60 [] kmem_cache_alloc+0x13f/0x2b0 [] kmem_zone_alloc+0x77/0xf0 [] kmem_zone_zalloc+0x1e/0x50 [] xlog_ticket_alloc+0x34/0x170 [] xlog_cil_push+0xa4/0x3f0 [] xlog_cil_force_lsn+0x15a/0x160 [] _xfs_log_force_lsn+0x75/0x2d0 [] _xfs_trans_commit+0x2bd/0x2f0 [] xfs_iomap_write_allocate+0x1ad/0x350 [] xfs_map_blocks+0x21f/0x370 [] xfs_vm_writepage+0x1c7/0x550 [] __writepage+0x1a/0x50 [] write_cache_pages+0x1c2/0x4c0 [] generic_writepages+0x27/0x30 [] xfs_vm_writepages+0x5d/0x80 By inspection, the leak occurs when xlog_write() returns and error and we jump to the abort path without dropping the reference on the active ticket. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_log_cil.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9dc8125d04e5..c7eac5acbfea 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -543,7 +543,7 @@ xlog_cil_push( error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); if (error) - goto out_abort; + goto out_abort_free_ticket; /* * now that we've written the checkpoint into the log, strictly @@ -569,8 +569,9 @@ restart: } spin_unlock(&cil->xc_cil_lock); + /* xfs_log_done always frees the ticket on error. */ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); - if (error || commit_lsn == -1) + if (commit_lsn == -1) goto out_abort; /* attach all the transactions w/ busy extents to iclog */ @@ -600,6 +601,8 @@ out_free_ticket: kmem_free(new_ctx); return 0; +out_abort_free_ticket: + xfs_log_ticket_put(tic); out_abort: xlog_cil_committed(ctx, XFS_LI_ABORTED); return XFS_ERROR(EIO); -- cgit v1.2.1 From ee2c9258501f83d3ed0fd09ce5df1cec53312cf0 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 27 Jan 2011 09:58:04 -0600 Subject: cifs: More crypto cleanup (try #2) Replaced md4 hashing function local to cifs module with kernel crypto APIs. As a result, md4 hashing function and its supporting functions in file md4.c are not needed anymore. Cleaned up function declarations, removed forward function declarations, and removed a header file that is being deleted from being included. Verified that sec=ntlm/i, sec=ntlmv2/i, and sec=ntlmssp/i work correctly. Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/Makefile | 2 +- fs/cifs/cifsencrypt.c | 32 +++++--- fs/cifs/cifsencrypt.h | 33 -------- fs/cifs/cifsproto.h | 9 ++- fs/cifs/connect.c | 6 +- fs/cifs/link.c | 5 +- fs/cifs/md4.c | 205 -------------------------------------------------- fs/cifs/smbdes.c | 1 - fs/cifs/smbencrypt.c | 90 +++++++++++++++------- 9 files changed, 97 insertions(+), 286 deletions(-) delete mode 100644 fs/cifs/cifsencrypt.h delete mode 100644 fs/cifs/md4.c (limited to 'fs') diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index e1322296cb69..d87558448e3d 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ - md4.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ + cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o cifs-$(CONFIG_CIFS_ACL) += cifsacl.o diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 35bf329c90e1..0db5f1de0227 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -36,11 +36,6 @@ /* Note that the smb header signature field on input contains the sequence number before this function is called */ -extern void mdfour(unsigned char *out, unsigned char *in, int n); -extern void E_md4hash(const unsigned char *passwd, unsigned char *p16); -extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, - unsigned char *p24); - static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, char *signature) { @@ -233,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, /* first calculate 24 bytes ntlm response and then 16 byte session key */ int setup_ntlm_response(struct cifsSesInfo *ses) { + int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; char temp_key[CIFS_SESS_KEY_SIZE]; @@ -246,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses) } ses->auth_key.len = temp_len; - SMBNTencrypt(ses->password, ses->server->cryptkey, + rc = SMBNTencrypt(ses->password, ses->server->cryptkey, ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) { + cFYI(1, "%s Can't generate NTLM response, error: %d", + __func__, rc); + return rc; + } + + rc = E_md4hash(ses->password, temp_key); + if (rc) { + cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + return rc; + } - E_md4hash(ses->password, temp_key); - mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + if (rc) + cFYI(1, "%s Can't generate NTLM session key, error: %d", + __func__, rc); - return 0; + return rc; } #ifdef CONFIG_CIFS_WEAK_PW_HASH @@ -699,14 +708,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) unsigned int size; server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); - if (!server->secmech.hmacmd5 || - IS_ERR(server->secmech.hmacmd5)) { + if (IS_ERR(server->secmech.hmacmd5)) { cERROR(1, "could not allocate crypto hmacmd5\n"); return PTR_ERR(server->secmech.hmacmd5); } server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); - if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { + if (IS_ERR(server->secmech.md5)) { cERROR(1, "could not allocate crypto md5\n"); rc = PTR_ERR(server->secmech.md5); goto crypto_allocate_md5_fail; diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h deleted file mode 100644 index 15d2ec006474..000000000000 --- a/fs/cifs/cifsencrypt.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * fs/cifs/cifsencrypt.h - * - * Copyright (c) International Business Machines Corp., 2005 - * Author(s): Steve French (sfrench@us.ibm.com) - * - * Externs for misc. small encryption routines - * so we do not have to put them in cifsproto.h - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* md4.c */ -extern void mdfour(unsigned char *out, unsigned char *in, int n); -/* smbdes.c */ -extern void E_P16(unsigned char *p14, unsigned char *p16); -extern void E_P24(unsigned char *p21, const unsigned char *c8, - unsigned char *p24); - - - diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 35c989f4924f..8096f27ad9a8 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -375,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, extern int cifs_verify_signature(struct smb_hdr *, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); extern int setup_ntlm_response(struct cifsSesInfo *); extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); @@ -425,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); +extern int mdfour(unsigned char *, unsigned char *, int); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, + unsigned char *p24); +extern void E_P16(unsigned char *p14, unsigned char *p16); +extern void E_P24(unsigned char *p21, const unsigned char *c8, + unsigned char *p24); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 47034af67b09..47d8ff623683 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -55,9 +55,6 @@ /* SMB echo "timeout" -- FIXME: tunable? */ #define SMB_ECHO_INTERVAL (60 * HZ) -extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, - unsigned char *p24); - extern mempool_t *cifs_req_poolp; struct smb_vol { @@ -2990,7 +2987,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr); else #endif /* CIFS_WEAK_PW_HASH */ - SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); + rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, + bcc_ptr); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index d3444ea6ac71..02cd60aefbff 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -54,10 +54,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) struct sdesc *sdescmd5; md5 = crypto_alloc_shash("md5", 0, 0); - if (!md5 || IS_ERR(md5)) { - rc = PTR_ERR(md5); + if (IS_ERR(md5)) { cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); - return rc; + return PTR_ERR(md5); } size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); sdescmd5 = kmalloc(size, GFP_KERNEL); diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c deleted file mode 100644 index a725c2609d67..000000000000 --- a/fs/cifs/md4.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - Unix SMB/Netbios implementation. - Version 1.9. - a implementation of MD4 designed for use in the SMB authentication protocol - Copyright (C) Andrew Tridgell 1997-1998. - Modified by Steve French (sfrench@us.ibm.com) 2002-2003 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ -#include -#include -#include "cifsencrypt.h" - -/* NOTE: This code makes no attempt to be fast! */ - -static __u32 -F(__u32 X, __u32 Y, __u32 Z) -{ - return (X & Y) | ((~X) & Z); -} - -static __u32 -G(__u32 X, __u32 Y, __u32 Z) -{ - return (X & Y) | (X & Z) | (Y & Z); -} - -static __u32 -H(__u32 X, __u32 Y, __u32 Z) -{ - return X ^ Y ^ Z; -} - -static __u32 -lshift(__u32 x, int s) -{ - x &= 0xFFFFFFFF; - return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s)); -} - -#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s) -#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s) -#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s) - -/* this applies md4 to 64 byte chunks */ -static void -mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D) -{ - int j; - __u32 AA, BB, CC, DD; - __u32 X[16]; - - - for (j = 0; j < 16; j++) - X[j] = M[j]; - - AA = *A; - BB = *B; - CC = *C; - DD = *D; - - ROUND1(A, B, C, D, 0, 3); - ROUND1(D, A, B, C, 1, 7); - ROUND1(C, D, A, B, 2, 11); - ROUND1(B, C, D, A, 3, 19); - ROUND1(A, B, C, D, 4, 3); - ROUND1(D, A, B, C, 5, 7); - ROUND1(C, D, A, B, 6, 11); - ROUND1(B, C, D, A, 7, 19); - ROUND1(A, B, C, D, 8, 3); - ROUND1(D, A, B, C, 9, 7); - ROUND1(C, D, A, B, 10, 11); - ROUND1(B, C, D, A, 11, 19); - ROUND1(A, B, C, D, 12, 3); - ROUND1(D, A, B, C, 13, 7); - ROUND1(C, D, A, B, 14, 11); - ROUND1(B, C, D, A, 15, 19); - - ROUND2(A, B, C, D, 0, 3); - ROUND2(D, A, B, C, 4, 5); - ROUND2(C, D, A, B, 8, 9); - ROUND2(B, C, D, A, 12, 13); - ROUND2(A, B, C, D, 1, 3); - ROUND2(D, A, B, C, 5, 5); - ROUND2(C, D, A, B, 9, 9); - ROUND2(B, C, D, A, 13, 13); - ROUND2(A, B, C, D, 2, 3); - ROUND2(D, A, B, C, 6, 5); - ROUND2(C, D, A, B, 10, 9); - ROUND2(B, C, D, A, 14, 13); - ROUND2(A, B, C, D, 3, 3); - ROUND2(D, A, B, C, 7, 5); - ROUND2(C, D, A, B, 11, 9); - ROUND2(B, C, D, A, 15, 13); - - ROUND3(A, B, C, D, 0, 3); - ROUND3(D, A, B, C, 8, 9); - ROUND3(C, D, A, B, 4, 11); - ROUND3(B, C, D, A, 12, 15); - ROUND3(A, B, C, D, 2, 3); - ROUND3(D, A, B, C, 10, 9); - ROUND3(C, D, A, B, 6, 11); - ROUND3(B, C, D, A, 14, 15); - ROUND3(A, B, C, D, 1, 3); - ROUND3(D, A, B, C, 9, 9); - ROUND3(C, D, A, B, 5, 11); - ROUND3(B, C, D, A, 13, 15); - ROUND3(A, B, C, D, 3, 3); - ROUND3(D, A, B, C, 11, 9); - ROUND3(C, D, A, B, 7, 11); - ROUND3(B, C, D, A, 15, 15); - - *A += AA; - *B += BB; - *C += CC; - *D += DD; - - *A &= 0xFFFFFFFF; - *B &= 0xFFFFFFFF; - *C &= 0xFFFFFFFF; - *D &= 0xFFFFFFFF; - - for (j = 0; j < 16; j++) - X[j] = 0; -} - -static void -copy64(__u32 *M, unsigned char *in) -{ - int i; - - for (i = 0; i < 16; i++) - M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) | - (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0); -} - -static void -copy4(unsigned char *out, __u32 x) -{ - out[0] = x & 0xFF; - out[1] = (x >> 8) & 0xFF; - out[2] = (x >> 16) & 0xFF; - out[3] = (x >> 24) & 0xFF; -} - -/* produce a md4 message digest from data of length n bytes */ -void -mdfour(unsigned char *out, unsigned char *in, int n) -{ - unsigned char buf[128]; - __u32 M[16]; - __u32 b = n * 8; - int i; - __u32 A = 0x67452301; - __u32 B = 0xefcdab89; - __u32 C = 0x98badcfe; - __u32 D = 0x10325476; - - while (n > 64) { - copy64(M, in); - mdfour64(M, &A, &B, &C, &D); - in += 64; - n -= 64; - } - - for (i = 0; i < 128; i++) - buf[i] = 0; - memcpy(buf, in, n); - buf[n] = 0x80; - - if (n <= 55) { - copy4(buf + 56, b); - copy64(M, buf); - mdfour64(M, &A, &B, &C, &D); - } else { - copy4(buf + 120, b); - copy64(M, buf); - mdfour64(M, &A, &B, &C, &D); - copy64(M, buf + 64); - mdfour64(M, &A, &B, &C, &D); - } - - for (i = 0; i < 128; i++) - buf[i] = 0; - copy64(M, buf); - - copy4(out, A); - copy4(out + 4, B); - copy4(out + 8, C); - copy4(out + 12, D); - - A = B = C = D = 0; -} diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c index b6b6dcb500bf..04721485925d 100644 --- a/fs/cifs/smbdes.c +++ b/fs/cifs/smbdes.c @@ -45,7 +45,6 @@ up with a different answer to the one above) */ #include -#include "cifsencrypt.h" #define uchar unsigned char static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 30135005e4f3..b5450e9f40c0 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -33,7 +33,7 @@ #include "cifspdu.h" #include "cifsglob.h" #include "cifs_debug.h" -#include "cifsencrypt.h" +#include "cifsproto.h" #ifndef false #define false 0 @@ -47,14 +47,57 @@ #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) -/*The following definitions come from libsmb/smbencrypt.c */ +/* produce a md4 message digest from data of length n bytes */ +int +mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) +{ + int rc; + unsigned int size; + struct crypto_shash *md4; + struct sdesc *sdescmd4; + + md4 = crypto_alloc_shash("md4", 0, 0); + if (IS_ERR(md4)) { + cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); + return PTR_ERR(md4); + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); + sdescmd4 = kmalloc(size, GFP_KERNEL); + if (!sdescmd4) { + rc = -ENOMEM; + cERROR(1, "%s: Memory allocation failure\n", __func__); + goto mdfour_err; + } + sdescmd4->shash.tfm = md4; + sdescmd4->shash.flags = 0x0; + + rc = crypto_shash_init(&sdescmd4->shash); + if (rc) { + cERROR(1, "%s: Could not init md4 shash\n", __func__); + goto mdfour_err; + } + crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_final(&sdescmd4->shash, md4_hash); -void SMBencrypt(unsigned char *passwd, const unsigned char *c8, - unsigned char *p24); -void E_md4hash(const unsigned char *passwd, unsigned char *p16); -static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, - unsigned char p24[24]); -void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); +mdfour_err: + crypto_free_shash(md4); + kfree(sdescmd4); + + return rc; +} + +/* Does the des encryption from the NT or LM MD4 hash. */ +static void +SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, + unsigned char p24[24]) +{ + unsigned char p21[21]; + + memset(p21, '\0', 21); + + memcpy(p21, passwd, 16); + E_P24(p21, c8, p24); +} /* This implements the X/Open SMB password encryption @@ -117,9 +160,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len) * Creates the MD4 Hash of the users password in NT UNICODE. */ -void +int E_md4hash(const unsigned char *passwd, unsigned char *p16) { + int rc; int len; __u16 wpwd[129]; @@ -138,8 +182,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16) /* Calculate length in bytes */ len = _my_wcslen(wpwd) * sizeof(__u16); - mdfour(p16, (unsigned char *) wpwd, len); + rc = mdfour(p16, (unsigned char *) wpwd, len); memset(wpwd, 0, 129 * 2); + + return rc; } #if 0 /* currently unused */ @@ -211,19 +257,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n, } #endif -/* Does the des encryption from the NT or LM MD4 hash. */ -static void -SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, - unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - - memcpy(p21, passwd, 16); - E_P24(p21, c8, p24); -} - /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ #if 0 /* currently unused */ static void @@ -241,16 +274,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8], #endif /* Does the NT MD4 hash then des encryption. */ - -void +int SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) { + int rc; unsigned char p21[21]; memset(p21, '\0', 21); - E_md4hash(passwd, p21); + rc = E_md4hash(passwd, p21); + if (rc) { + cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + return rc; + } SMBOWFencrypt(p21, c8, p24); + return rc; } -- cgit v1.2.1 From e34a314c5e49fe6b763568f6576b19f1299c33c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 12:13:35 +1100 Subject: xfs: fix efi item leak on forced shutdown After test 139, kmemleak shows: unreferenced object 0xffff880078b405d8 (size 400): comm "xfs_io", pid 4904, jiffies 4294909383 (age 1186.728s) hex dump (first 32 bytes): 60 c1 17 79 00 88 ff ff 60 c1 17 79 00 88 ff ff `..y....`..y.... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x2d/0x60 [] kmem_cache_alloc+0x13f/0x2b0 [] kmem_zone_alloc+0x77/0xf0 [] kmem_zone_zalloc+0x1e/0x50 [] xfs_efi_init+0x4b/0xb0 [] xfs_trans_get_efi+0x58/0x90 [] xfs_bmap_finish+0x8b/0x1d0 [] xfs_itruncate_finish+0x2c4/0x5d0 [] xfs_setattr+0x8df/0xa70 [] xfs_vn_setattr+0x1b/0x20 [] notify_change+0x170/0x2e0 [] do_truncate+0x66/0xa0 [] sys_ftruncate+0xdb/0xe0 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff The cause of the leak is that the "remove" parameter of IOP_UNPIN() is never set when a CIL push is aborted. This means that the EFI item is never freed if it was in the push being cancelled. The problem is specific to delayed logging, but has uncovered a couple of problems with the handling of IOP_UNPIN(remove). Firstly, we cannot safely call xfs_trans_del_item() from IOP_UNPIN() in the CIL commit failure path or the iclog write failure path because for delayed loging we have no transaction context. Hence we must only call xfs_trans_del_item() if the log item being unpinned has an active log item descriptor. Secondly, xfs_trans_uncommit() does not handle log item descriptor freeing during the traversal of log items on a transaction. It can reference a freed log item descriptor when unpinning an EFI item. Hence it needs to use a safe list traversal method to allow items to be removed from the transaction during IOP_UNPIN(). Signed-off-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 12 +++++++----- fs/xfs/xfs_extfree_item.c | 3 ++- fs/xfs/xfs_trans.c | 36 +++++++++++++++++++++++++++++------- 3 files changed, 38 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 98c6f73b6752..6f8c21ce0d6d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -427,13 +427,15 @@ xfs_buf_item_unpin( if (remove) { /* - * We have to remove the log item from the transaction - * as we are about to release our reference to the - * buffer. If we don't, the unlock that occurs later - * in xfs_trans_uncommit() will ry to reference the + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ - xfs_trans_del_item(lip); + if (lip->li_desc) + xfs_trans_del_item(lip); /* * Since the transaction no longer refers to the buffer, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 75f2ef60e579..d22e62623437 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -138,7 +138,8 @@ xfs_efi_item_unpin( if (remove) { ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - xfs_trans_del_item(lip); + if (lip->li_desc) + xfs_trans_del_item(lip); xfs_efi_item_free(efip); return; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 33dbc4e0ad62..29f5e5424897 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert( * Bulk operation version of xfs_trans_committed that takes a log vector of * items to insert into the AIL. This uses bulk AIL insertion techniques to * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is IOP_COMMITTED processing, followed by an + * IOP_UNPIN(aborted) call. */ void xfs_trans_committed_bulk( @@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk( if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) continue; + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + IOP_UNPIN(lip, 1); + continue; + } + if (item_lsn != commit_lsn) { /* @@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk( } /* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. + * Called from the trans_commit code when we notice that the filesystem is in + * the middle of a forced shutdown. + * + * When we are called here, we have already pinned all the items in the + * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called + * so we can simply walk the items in the transaction, unpin them with an abort + * flag and then free the items. Note that unpinning the items can result in + * them being freed immediately, so we need to use a safe list traversal method + * here. */ STATIC void xfs_trans_uncommit( struct xfs_trans *tp, uint flags) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item_desc *lidp, *n; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Unpin all but those that aren't dirty. - */ + list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { if (lidp->lid_flags & XFS_LID_DIRTY) IOP_UNPIN(lidp->lid_item, 1); } -- cgit v1.2.1 From b8fc82630ae289bb4e661567808afc59e3298dce Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 12:14:12 +1100 Subject: xfs: speculative delayed allocation uses rounddown_power_of_2 badly rounddown_power_of_2() returns an undefined result when passed a value of zero. The specualtive delayed allocation code is doing this when the inode is zero length. Hence occasionally the preallocation is much, much larger than is necessary (e.g. 8GB for a 270 _byte_ file). Ensure we don't even pass a zero value to this function so the result of preallocation is always the desired size. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_iomap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 55582bd66659..8a0f044750c3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -337,7 +337,12 @@ xfs_iomap_prealloc_size( int shift = 0; int64_t freesp; - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); + /* + * rounddown_pow_of_two() returns an undefined result + * if we pass in alloc_blocks = 0. Hence the "+ 1" to + * ensure we always pass in a non-zero value. + */ + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); -- cgit v1.2.1 From 14b064ceaa6f51a7426cc45b4b43685b94380658 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 12:16:28 +1100 Subject: xfs: limit extent length for allocation to AG size Delayed allocation extents can be larger than AGs, so when trying to convert a large range we may scan every AG inside xfs_bmap_alloc_nullfb() trying to find an AG with a size larger than an AG. We should stop when we find the first AG with a maximum possible allocation size. This causes excessive CPU usage when there are lots of AGs. The same problem occurs when doing preallocation of a range larger than an AG. Fix the problem by limiting real allocation lengths to the maximum that an AG can support. This means if we have empty AGs, we'll stop the search at the first of them. If there are no empty AGs, we'll still scan them all, but that is a different problem.... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_alloc.h | 16 ++++++++++++++++ fs/xfs/xfs_bmap.c | 18 ++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 0ab56b32c7eb..d0b3bc72005b 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -74,6 +74,22 @@ typedef unsigned int xfs_alloctype_t; */ #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + /* * Argument structure for xfs_alloc routines. * This is turned into a structure to avoid having 20 arguments passed diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 4111cd3966c7..f3a3768189bb 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2430,7 +2430,7 @@ xfs_bmap_btalloc_nullfb( startag = ag = 0; pag = xfs_perag_get(mp, ag); - while (*blen < ap->alen) { + while (*blen < args->maxlen) { if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, args->tp, ag, XFS_ALLOC_FLAG_TRYLOCK); @@ -2452,7 +2452,7 @@ xfs_bmap_btalloc_nullfb( notinit = 1; if (xfs_inode_is_filestream(ap->ip)) { - if (*blen >= ap->alen) + if (*blen >= args->maxlen) break; if (ap->userdata) { @@ -2498,14 +2498,14 @@ xfs_bmap_btalloc_nullfb( * If the best seen length is less than the request * length, use the best as the minimum. */ - else if (*blen < ap->alen) + else if (*blen < args->maxlen) args->minlen = *blen; /* - * Otherwise we've seen an extent as big as alen, + * Otherwise we've seen an extent as big as maxlen, * use that as the minimum. */ else - args->minlen = ap->alen; + args->minlen = args->maxlen; /* * set the failure fallback case to look in the selected @@ -2573,7 +2573,9 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->rval; - args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); args.firstblock = ap->firstblock; blen = 0; if (nullfb) { @@ -2621,7 +2623,7 @@ xfs_bmap_btalloc( /* * Adjust for alignment */ - if (blen > args.alignment && blen <= ap->alen) + if (blen > args.alignment && blen <= args.maxlen) args.minlen = blen - args.alignment; args.minalignslop = 0; } else { @@ -2640,7 +2642,7 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= ap->alen) + if (blen > mp->m_dalign && blen <= args.maxlen) nextminlen = blen - mp->m_dalign; else nextminlen = args.minlen; -- cgit v1.2.1 From 4ce159890c00e2cc705e955a939bf1dca7b07ab8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 12:17:58 +1100 Subject: xfs: prevent extsize alignment from exceeding maximum extent size When doing delayed allocation, if the allocation size is for a maximally sized extent, extent size alignment can push it over this limit. This results in an assert failure in xfs_bmbt_set_allf() as the extent length is too large to find in the extent record. Fix this by ensuring that we allow for space that extent size alignment requires (up to 2 * (extsize -1) blocks as we have to handle both head and tail alignment) when limiting the maximum size of the extent. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_bmap.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index f3a3768189bb..3e9c278a8f78 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4487,6 +4487,16 @@ xfs_bmapi( /* Figure out the extent size, adjust alen */ extsz = xfs_get_extsz_hint(ip); if (extsz) { + /* + * make sure we don't exceed a single + * extent length when we align the + * extent by reducing length we are + * going to allocate by the maximum + * amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, + MAXEXTLEN - (2 * extsz - 1)); error = xfs_bmap_extsize_align(mp, &got, &prev, extsz, rt, eof, -- cgit v1.2.1 From 5315837daee7ed76c31ef643915f7d76ef8c1aa3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 12:18:18 +1100 Subject: xfs: limit extsize to size of AGs and/or MAXEXTLEN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extent size hint can be set to larger than an AG. This means that the alignment process can push the range to be allocated outside the bounds of the AG, resulting in assert failures or corrupted bmbt records. Similarly, if the extsize is larger than the maximum extent size supported, the alignment process will produce extents that are too large to fit into the bmbt records, resulting in a different type of assert/corruption failure. Fix this by limiting extsize at the time Ñ–t is set firstly to be less than MAXEXTLEN, then to be a maximum of half the size of the AGs in the filesystem for non-realtime inodes. Realtime inodes do not allocate out of AGs, so don't have to be restricted by the size of AGs. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index b06ede1d0bed..f5e2a19e0f8e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -985,10 +985,22 @@ xfs_ioctl_setattr( /* * Extent size must be a multiple of the appropriate block - * size, if set at all. + * size, if set at all. It must also be smaller than the + * maximum extent size supported by the filesystem. + * + * Also, for non-realtime files, limit the extent size hint to + * half the size of the AGs in the filesystem so alignment + * doesn't result in extents larger than an AG. */ if (fa->fsx_extsize != 0) { - xfs_extlen_t size; + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) { + code = XFS_ERROR(EINVAL); + goto error_return; + } if (XFS_IS_REALTIME_INODE(ip) || ((mask & FSX_XFLAGS) && @@ -997,6 +1009,10 @@ xfs_ioctl_setattr( mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { + code = XFS_ERROR(EINVAL); + goto error_return; + } } if (fa->fsx_extsize % size) { -- cgit v1.2.1 From c6f990d1ff8e4e53b12f4175eb7d7ea710c3ca73 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Jan 2011 13:23:28 +1100 Subject: xfs: handle CIl transaction commit failures correctly Failure to commit a transaction into the CIL is not handled correctly. This currently can only happen when racing with a shutdown and requires an explicit shutdown check, so it rare and can be avoided. Remove the shutdown check and make the CIL commit a void function to indicate it will always succeed, thereby removing the incorrectly handled failure case. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_log_cil.c | 8 +------- fs/xfs/xfs_trans.c | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 916eb7db14d9..3bd3291ef8d2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); -int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c7eac5acbfea..9ca59be08977 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -625,7 +625,7 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. */ -int +void xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, @@ -640,11 +640,6 @@ xfs_log_commit_cil( if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; - if (XLOG_FORCED_SHUTDOWN(log)) { - xlog_cil_free_logvec(log_vector); - return XFS_ERROR(EIO); - } - /* * do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL @@ -704,7 +699,6 @@ xfs_log_commit_cil( */ if (push) xlog_cil_push(log, 0); - return 0; } /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 29f5e5424897..76922793f64f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1755,7 +1755,6 @@ xfs_trans_commit_cil( int flags) { struct xfs_log_vec *log_vector; - int error; /* * Get each log item to allocate a vector structure for @@ -1766,9 +1765,7 @@ xfs_trans_commit_cil( if (!log_vector) return ENOMEM; - error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); - if (error) - return error; + xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); -- cgit v1.2.1 From 0fbca4d1c3932c27c4794bf5c2b5fc961cf5a54f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 28 Jan 2011 11:20:46 +1100 Subject: xfs: fix dquot shaker deadlock Commit 368e136 ("xfs: remove duplicate code from dquot reclaim") fails to unlock the dquot freelist when the number of loop restarts is exceeded in xfs_qm_dqreclaim_one(). This causes hangs in memory reclaim. Rework the loop control logic into an unwind stack that all the different cases jump into. This means there is only one set of code that processes the loop exit criteria, and simplifies the unlocking of all the items from different points in the loop. It also fixes a double increment of the restart counter from the qi_dqlist_lock case. Reported-by: Malcolm Scott Signed-off-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/quota/xfs_qm.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index f8e854b4fde8..206a2815ced6 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void) xfs_dquot_t *dqpout; xfs_dquot_t *dqp; int restarts; + int startagain; restarts = 0; dqpout = NULL; /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -startagain: +again: + startagain = 0; mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { @@ -1885,13 +1887,10 @@ startagain: ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; + restarts++; + startagain = 1; + goto dqunlock; } /* @@ -1906,23 +1905,20 @@ startagain: ASSERT(list_empty(&dqp->q_mplist)); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; + goto dqunlock; } ASSERT(dqp->q_hash); ASSERT(!list_empty(&dqp->q_mplist)); /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. + * Try to grab the flush lock. If this dquot is in the process + * of getting flushed to disk, we don't want to reclaim it. */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } + if (!xfs_dqflock_nowait(dqp)) + goto dqunlock; /* * We have the flush lock so we know that this is not in the @@ -1944,8 +1940,7 @@ startagain: xfs_fs_cmn_err(CE_WARN, mp, "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; + goto dqunlock; } /* @@ -1967,13 +1962,8 @@ startagain: */ if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { restarts++; - mutex_unlock(&dqp->q_hash->qh_lock); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - goto startagain; + startagain = 1; + goto qhunlock; } ASSERT(dqp->q_nrefs == 0); @@ -1986,14 +1976,20 @@ startagain: xfs_Gqm->qm_dqfrlist_cnt--; dqpout = dqp; mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); +qhunlock: mutex_unlock(&dqp->q_hash->qh_lock); dqfunlock: xfs_dqfunlock(dqp); +dqunlock: xfs_dqunlock(dqp); if (dqpout) break; if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; + break; + if (startagain) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + goto again; + } } mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); return dqpout; -- cgit v1.2.1 From 24446fc66fdebbdd8baca0f44fd2a47ad77ba580 Mon Sep 17 00:00:00 2001 From: "bpm@sgi.com" Date: Wed, 19 Jan 2011 17:41:58 +0000 Subject: xfs: xfs_bmap_add_extent_delay_real should init br_startblock When filling in the middle of a previous delayed allocation in xfs_bmap_add_extent_delay_real, set br_startblock of the new delay extent to the right to nullstartblock instead of 0 before inserting the extent into the ifork (xfs_iext_insert), rather than setting br_startblock afterward. Adding the extent into the ifork with br_startblock=0 can lead to the extent being copied into the btree by xfs_bmap_extent_to_btree if we happen to convert from extents format to btree format before updating br_startblock with the correct value. The unexpected addition of this delay extent to the btree can cause subsequent XFS_WANT_CORRUPTED_GOTO filesystem shutdown in several xfs_bmap_add_extent_delay_real cases where we are converting a delay extent to real and unexpectedly find an extent already inserted. For example: 911 case BMAP_LEFT_FILLING: 912 /* 913 * Filling in the first part of a previous delayed allocation. 914 * The left neighbor is not contiguous. 915 */ 916 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 917 xfs_bmbt_set_startoff(ep, new_endoff); 918 temp = PREV.br_blockcount - new->br_blockcount; 919 xfs_bmbt_set_blockcount(ep, temp); 920 xfs_iext_insert(ip, idx, 1, new, state); 921 ip->i_df.if_lastex = idx; 922 ip->i_d.di_nextents++; 923 if (cur == NULL) 924 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 925 else { 926 rval = XFS_ILOG_CORE; 927 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, 928 new->br_startblock, new->br_blockcount, 929 &i))) 930 goto done; 931 XFS_WANT_CORRUPTED_GOTO(i == 0, done); With the bogus extent in the btree we shutdown the filesystem at 931. The conversion from extents to btree format happens when the number of extents in the inode increases above ip->i_df.if_ext_max. xfs_bmap_extent_to_btree copies extents from the ifork into the btree, ignoring all delalloc extents which are denoted by br_startblock having some value of nullstartblock. SGI-PV: 1013221 Signed-off-by: Ben Myers Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3e9c278a8f78..dc3afd7739ff 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real( * Filling in the middle part of a previous delayed allocation. * Contiguity is impossible here. * This case is avoided almost all the time. + * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 */ temp = new->br_startoff - PREV.br_startoff; - trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - r[0] = *new; - r[1].br_state = PREV.br_state; - r[1].br_startblock = 0; - r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - r[1].br_blockcount = temp2; - xfs_iext_insert(ip, idx + 1, 2, &r[0], state); + trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) -- cgit v1.2.1 From e00b8a24041f37e56b4b8415ce4eba1cbc238065 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Jan 2011 14:55:39 -0500 Subject: NFS: Fix an NFS client lockdep issue There is no reason to be freeing the delegation cred in the rcu callback, and doing so is resulting in a lockdep complaint that rpc_credcache_lock is being called from both softirq and non-softirq contexts. Reported-by: Chuck Lever Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/delegation.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 364e4328f392..bbbc6bf5cb2e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -23,8 +23,6 @@ static void nfs_do_free_delegation(struct nfs_delegation *delegation) { - if (delegation->cred) - put_rpccred(delegation->cred); kfree(delegation); } @@ -37,6 +35,10 @@ static void nfs_free_delegation_callback(struct rcu_head *head) static void nfs_free_delegation(struct nfs_delegation *delegation) { + if (delegation->cred) { + put_rpccred(delegation->cred); + delegation->cred = NULL; + } call_rcu(&delegation->rcu, nfs_free_delegation_callback); } -- cgit v1.2.1 From c08e76d0cd4beb759a73c1835d98f5fccc126ed1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Jan 2011 12:40:55 -0500 Subject: NFS: Micro-optimize nfs4_decode_dirent() Make the decoding of NFSv4 directory entries slightly more efficient by: 1. Avoiding unnecessary byte swapping when checking XDR booleans, and 2. Not bumping "p" when its value will be immediately replaced by xdr_inline_decode() This commit makes nfs4_decode_dirent() consistent with similar logic in the other two decode_dirent() functions. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2ab8e5cb8f59..009aef9e12bc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6086,11 +6086,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - if (!ntohl(*p++)) { + if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - if (!ntohl(*p++)) + if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; return -EBADCOOKIE; @@ -6101,7 +6101,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); - entry->len = ntohl(*p++); + entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); if (unlikely(!p)) -- cgit v1.2.1 From d1205f87bbb8040c1408bbd9e0a720310b2b0b9b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Jan 2011 12:41:05 -0500 Subject: NFS: NFSv4 readdir loses entries On recent 2.6.38-rc kernels, connectathon basic test 6 fails on NFSv4 mounts of OpenSolaris with something like: > ./test6: readdir > ./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.12' dir entry, pass 0 > ./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.82' dir entry, pass 0 > ./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.164' dir entry, pass 0 > ./test6: (/mnt/klimt/matisse.test) Test failed with 3 errors > basic tests failed > Tests failed, leaving /mnt/klimt mounted > [cel@matisse cthon04]$ I narrowed the problem down to nfs4_decode_dirent() reporting that the decode buffer had overflowed while decoding the entries for those missing files. verify_attr_len() assumes both it's pointer arguments reside on the same page. When these arguments point to locations on two different pages, verify_attr_len() can report false errors. This can happen now that a large NFSv4 readdir result can span pages. We have reasonably good checking in nfs4_decode_dirent() anyway, so it should be safe to simply remove the extra checking. At a guess, this was introduced by commit 6650239a, "NFS: Don't use vm_map_ram() in readdir". Cc: stable@kernel.org [2.6.37] Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 009aef9e12bc..4e2c168b6ee9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6132,9 +6132,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - if (verify_attr_len(xdr, p, len) < 0) - goto out_overflow; - return 0; out_overflow: -- cgit v1.2.1 From 6b82ce8d824bd46053e46a895876cde39d9026e4 Mon Sep 17 00:00:00 2001 From: liubo Date: Wed, 26 Jan 2011 06:21:39 +0000 Subject: btrfs: fix uncheck memory allocation in btrfs_submit_compressed_read btrfs_submit_compressed_read() is lack of memory allocation checks and corresponding error route. After this fix, if it comes to "no memory" case, errno will be returned to userland step by step, and tell users this operation cannot go on. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 25 +++++++++++++++++++++++-- fs/btrfs/extent_io.c | 4 ++-- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f745287fbf2e..3a932f183da1 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -562,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret; + int ret = -ENOMEM; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -577,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -597,13 +600,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; - cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; for (page_index = 0; page_index < nr_pages; page_index++) { cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!cb->compressed_pages[page_index]) + goto fail2; } cb->nr_pages = nr_pages; @@ -614,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -681,6 +691,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_put(comp_bio); return 0; + +fail2: + for (page_index = 0; page_index < nr_pages; page_index++) + free_page((unsigned long)cb->compressed_pages[page_index]); + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; } static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8b8d3d99ae68..6411ed6ca449 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1865,7 +1865,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else submit_bio(rw, bio); @@ -2126,7 +2126,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, 0, bio_flags); return ret; } -- cgit v1.2.1 From 2a29edc6b60a5248ccab588e7ba7dad38cef0235 Mon Sep 17 00:00:00 2001 From: liubo Date: Wed, 26 Jan 2011 06:22:08 +0000 Subject: btrfs: fix several uncheck memory allocations To make btrfs more stable, add several missing necessary memory allocation checks, and when no memory, return proper errno. We've checked that some of those -ENOMEM errors will be returned to userspace, and some will be catched by BUG_ON() in the upper callers, and none will be ignored silently. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/export.c | 2 ++ fs/btrfs/file-item.c | 2 ++ fs/btrfs/file.c | 4 ++++ fs/btrfs/tree-log.c | 25 +++++++++++++++++++++++++ 4 files changed, 33 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 6f0444473594..3220ad1aafc8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -176,6 +176,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child) int ret; path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = root->root_key.objectid; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a562a250ae77..d0bc72657cd7 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, root = root->fs_info->csum_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f903433f5bdf..65b2424a4116 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -945,6 +945,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } /* generic_write_checks can change our pos */ start_pos = pos; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 054744ac5719..c25a41d86118 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, } dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(root, path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } read_extent_buffer(eb, src_copy, src_ptr, item_size); @@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); name_len = btrfs_dir_name_len(leaf, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(root, path); @@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log, int match = 0; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); if (ret != 0) goto out; @@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset = (u64)-1; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); @@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; if (*level == 1) { wc->process_func(root, next, wc, ptr_gen); @@ -2194,6 +2213,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2594,6 +2616,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); -- cgit v1.2.1 From 333e8105445d4f51101fc3d23199a919d66730b3 Mon Sep 17 00:00:00 2001 From: liubo Date: Wed, 26 Jan 2011 06:22:33 +0000 Subject: btrfs: fix missing break in switch phrase There is a missing break in switch, fix it. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/print-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0d126be22b63..fb2605d998e9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) #else BUG(); #endif + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); -- cgit v1.2.1 From 34d19bada00f4825588b338a8ee193820f9ceeb0 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 24 Jan 2011 19:55:19 +0000 Subject: fs/btrfs/inode.c: Add missing IS_ERR test After the conditional that precedes the following code, inode may be an ERR_PTR value. This can eg result from a memory allocation failure via the call to btrfs_iget, and thus does not imply that root is different than sub_root. Thus, an IS_ERR check is added to ensure that there is no dereference of inode in this case. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ identifier f; @@ f(...) { ... return ERR_PTR(...); } @@ identifier r.f, fld; expression x; statement S1,S2; @@ x = f(...) ... when != IS_ERR(x) ( if (IS_ERR(x) ||...) S1 else S2 | *x->fld ) // Signed-off-by: Julia Lawall Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2c9a2f7d5631..2b7d251d6ad1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4137,7 +4137,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); - if (root != sub_root) { + if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) btrfs_orphan_cleanup(sub_root); -- cgit v1.2.1 From 3612b49598c303cfb22a4b609427f829828e2427 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 25 Jan 2011 02:51:38 +0000 Subject: btrfs: fix return value check of btrfs_join_transaction() The error check of btrfs_join_transaction()/btrfs_join_transaction_nolock() is added, and the mistake of the error check in several places is corrected. For more stable Btrfs, I think that we should reduce BUG_ON(). But, I think that long time is necessary for this. So, I propose this patch as a short-term solution. With this patch: - To more stable Btrfs, the part that should be corrected is clarified. - The panic isn't done by the NULL pointer reference etc. (even if BUG_ON() is increased temporarily) - The error code is returned in the place where the error can be easily returned. As a long-term plan: - BUG_ON() is reduced by using the forced-readonly framework, etc. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 5 +++++ fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 24 ++++++++++++++++-------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 26 +++++++++++++++++++++++--- fs/btrfs/transaction.c | 5 +++++ 6 files changed, 51 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2887b8be6fdd..b36eeef19194 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1550,6 +1550,7 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->new_trans_lock); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -2464,10 +2465,14 @@ int btrfs_commit_super(struct btrfs_root *root) up_write(&root->fs_info->cleanup_work_sem); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bcf303204f7f..98ee139885cc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7478,7 +7478,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) BUG_ON(reloc_root->commit_root != NULL); while (1) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, reloc_root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b7d251d6ad1..40fee137dd11 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -416,7 +416,7 @@ again: } if (start == 0) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -612,6 +612,7 @@ retry: GFP_NOFS); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -771,7 +772,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1049,7 +1050,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, } else { trans = btrfs_join_transaction(root, 1); } - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); cow_start = (u64)-1; cur_offset = start; @@ -1704,7 +1705,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -1721,6 +1722,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -2382,6 +2384,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) if (root->orphan_block_rsv || root->orphan_item_inserted) { trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_end_transaction(trans, root); } @@ -4350,6 +4353,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); @@ -4375,6 +4380,7 @@ void btrfs_dirty_inode(struct inode *inode) return; trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); @@ -5179,6 +5185,8 @@ again: em = NULL; btrfs_release_path(root, path); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return ERR_CAST(trans); goto again; } map = kmap(page); @@ -5283,8 +5291,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + len - 1, 0); trans = btrfs_join_transaction(root, 0); - if (!trans) - return ERR_PTR(-ENOMEM); + if (IS_ERR(trans)) + return ERR_CAST(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -5508,7 +5516,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * while we look for nocow cross refs */ trans = btrfs_join_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { @@ -5643,7 +5651,7 @@ again: BUG_ON(!ordered); trans = btrfs_join_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { err = -ENOMEM; goto out; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index edd82becbb9e..04b4fb9144a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -203,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 045c9c2b2d7e..ea9965430241 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2147,6 +2147,12 @@ again: } trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } if (!err) { if (num_bytes != rc->merging_rsv_size) { @@ -3222,6 +3228,7 @@ truncate: trans = btrfs_join_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); + ret = PTR_ERR(trans); goto out; } @@ -3628,6 +3635,7 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; } @@ -3804,7 +3812,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4125,6 +4136,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } rc->merge_reloc_tree = 1; @@ -4154,9 +4170,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); -out: + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: kfree(rc); +out: while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bae5c7b8bbe2..3d73c8d93bbb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1161,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root, 0); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } /* take transaction reference */ mutex_lock(&root->fs_info->trans_mutex); -- cgit v1.2.1 From abd30bb0af9d4671506502278e8631bed9e3c35c Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 24 Jan 2011 00:57:10 +0000 Subject: btrfs: check return value of btrfs_start_ioctl_transaction() properly btrfs_start_ioctl_transaction() returns ERR_PTR(), not NULL. So, it is necessary to use IS_ERR() to check the return value. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 04b4fb9144a9..12dabe28cf54 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2085,7 +2085,7 @@ static long btrfs_ioctl_trans_start(struct file *file) ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto out_drop; file->private_data = trans; -- cgit v1.2.1 From dedefd7215d3ec451291ca393e5c8e4c1882c8c6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jan 2011 21:43:18 +0000 Subject: Btrfs: fix check_path_shared so it returns the right value When running xfstests 224 I kept getting ENOSPC when trying to remove the files, and this is because we were returning ret from check_path_shared while it was uninitalized, which isn't right. Fix this to return 0 properly, and now xfstests 224 doesn't freak out when it tries to clean itself up. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 40fee137dd11..5621818921f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2718,9 +2718,10 @@ static int check_path_shared(struct btrfs_root *root, struct extent_buffer *eb; int level; u64 refs = 1; - int uninitialized_var(ret); for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + int ret; + if (!path->nodes[level]) break; eb = path->nodes[level]; @@ -2731,7 +2732,7 @@ static int check_path_shared(struct btrfs_root *root, if (refs > 1) return 1; } - return ret; /* XXX callers? */ + return 0; } /* -- cgit v1.2.1 From e9e22899de661af94cb9995885fd04e4c738838b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jan 2011 21:43:19 +0000 Subject: Btrfs: do not release more reserved bytes to the global_block_rsv than we need When we do btrfs_block_rsv_release, if global_block_rsv is not full we will release all the extra bytes to global_block_rsv, even if it's only a little short of the amount of space that we need to reserve. This causes us to starve ourselves of reservable space during the transaction which will force us to shrink delalloc bytes and commit the transaction more often than we should. So instead just add the amount of bytes we need to add to the global reserve so reserved == size, and then add the rest back into the space_info for general use. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 98ee139885cc..7af618dcf2c0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3589,8 +3589,20 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes > 0) { if (dest) { - block_rsv_add_bytes(dest, num_bytes, 0); - } else { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; spin_unlock(&space_info->lock); -- cgit v1.2.1 From 68a82277b8619e6d0f2738b1d9b160b627e81e92 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jan 2011 21:43:20 +0000 Subject: Btrfs: use the global block reserve if we cannot reserve space We call use_block_rsv right before we make an allocation in order to make sure we have enough space. Now normally people have called btrfs_start_transaction() with the appropriate amount of space that we need, so we just use some of that pre-reserved space and move along happily. The problem is where people use btrfs_join_transaction(), which doesn't actually reserve any space. So we try and reserve space here, but we cannot flush delalloc, so this forces us to return -ENOSPC when in reality we have plenty of space. The most common symptom is seeing a bunch of "couldn't dirty inode" messages in syslog. With xfstests 224 we end up falling back to start_transaction and then doing all the flush delalloc stuff which causes to hang for a very long time. So instead steal from the global reserve, which is what this is meant for anyway. With this patch and the other 2 I have sent xfstests 224 now passes successfully. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7af618dcf2c0..ff6bbfd75cf7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5646,6 +5646,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize) { struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; block_rsv = get_block_rsv(trans, root); @@ -5653,14 +5654,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->size == 0) { ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); - if (ret) + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + return ERR_PTR(ret); + } else if (ret) { return ERR_PTR(ret); + } return block_rsv; } ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; + if (ret) { + WARN_ON(1); + ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, + 0); + if (!ret) { + spin_lock(&block_rsv->lock); + block_rsv->size += blocksize; + spin_unlock(&block_rsv->lock); + return block_rsv; + } else if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + } return ERR_PTR(-ENOSPC); } -- cgit v1.2.1 From ad0397a7a97f55fd7f70998ec208c5d8b90310ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Jan 2011 18:44:44 +0000 Subject: Btrfs: do error checking in btrfs_del_csums Got a report of a box panicing because we got a NULL eb in read_extent_buffer. His fs was borked and btrfs_search_path returned EIO, but we don't check for errors so the box paniced. Yes I know this will just make something higher up the stack panic, but that's a problem for future Josef. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file-item.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d0bc72657cd7..4f19a3e1bf32 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -550,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (path->slots[0] == 0) goto out; path->slots[0]--; + } else if (ret < 0) { + goto out; } + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); -- cgit v1.2.1 From 7adf5dfbb3af65a00e20b3ead224c3a1b40e4ec4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Jan 2011 22:11:54 +0000 Subject: Btrfs: handle no memory properly in prepare_pages Instead of doing a BUG_ON(1) in prepare_pages if grab_cache_page() fails, just loop through the pages we've already grabbed and unlock and release them, then return -ENOMEM like we should. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 65b2424a4116..9e097fbfc78d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -792,8 +792,12 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { - err = -ENOMEM; - BUG_ON(1); + int c; + for (c = i - 1; c >= 0; c--) { + unlock_page(pages[c]); + page_cache_release(pages[c]); + } + return -ENOMEM; } wait_on_page_writeback(pages[i]); } -- cgit v1.2.1 From af5eb745efe97d91d2cbe793029838b3311c15da Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Fri, 28 Jan 2011 20:45:28 +0000 Subject: NTFS: Fix invalid pointer dereference in ntfs_mft_record_alloc(). In ntfs_mft_record_alloc() when mapping the new extent mft record with map_extent_mft_record() we overwrite @m with the return value and on error, we then try to use the old @m but that is no longer there as @m now contains an error code instead so we crash when dereferencing the error code as if it were a pointer. The simple fix is to use a temporary variable to store the return value thus preserving the original @m for later use. This is a backport from the commercial Tuxera-NTFS driver and is well tested... Thanks go to Julia Lawall for pointing this out (whilst I had fixed it in the commercial driver I had failed to fix it in the Linux kernel). Signed-off-by: Anton Altaparmakov Signed-off-by: Linus Torvalds --- fs/ntfs/mft.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index b572b6727181..326e7475a22a 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@ /** * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -2576,6 +2576,8 @@ mft_rec_already_initialized: flush_dcache_page(page); SetPageUptodate(page); if (base_ni) { + MFT_RECORD *m_tmp; + /* * Setup the base mft record in the extent mft record. This * completes initialization of the allocated extent mft record @@ -2588,11 +2590,11 @@ mft_rec_already_initialized: * attach it to the base inode @base_ni and map, pin, and lock * its, i.e. the allocated, mft record. */ - m = map_extent_mft_record(base_ni, bit, &ni); - if (IS_ERR(m)) { + m_tmp = map_extent_mft_record(base_ni, bit, &ni); + if (IS_ERR(m_tmp)) { ntfs_error(vol->sb, "Failed to map allocated extent " "mft record 0x%llx.", (long long)bit); - err = PTR_ERR(m); + err = PTR_ERR(m_tmp); /* Set the mft record itself not in use. */ m->flags &= cpu_to_le16( ~le16_to_cpu(MFT_RECORD_IN_USE)); @@ -2603,6 +2605,7 @@ mft_rec_already_initialized: ntfs_unmap_page(page); goto undo_mftbmp_alloc; } + BUG_ON(m != m_tmp); /* * Make sure the allocated mft record is written out to disk. * No need to set the inode dirty because the caller is going -- cgit v1.2.1 From ffeb414a59291d5891f09727beb793c109f19f08 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 29 Jan 2011 07:03:02 -0500 Subject: cifs: fix two compiler warning about uninitialized vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/cifs/link.c: In function ‘symlink_hash’: fs/cifs/link.c:58:3: warning: ‘rc’ may be used uninitialized in this function [-Wuninitialized] fs/cifs/smbencrypt.c: In function ‘mdfour’: fs/cifs/smbencrypt.c:61:3: warning: ‘rc’ may be used uninitialized in this function [-Wuninitialized] Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/link.c | 3 ++- fs/cifs/smbencrypt.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 02cd60aefbff..e8804d373404 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -55,8 +55,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(md5)) { + rc = PTR_ERR(md5); cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); - return PTR_ERR(md5); + return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); sdescmd5 = kmalloc(size, GFP_KERNEL); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index b5450e9f40c0..b5041c849981 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -58,8 +58,9 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) md4 = crypto_alloc_shash("md4", 0, 0); if (IS_ERR(md4)) { + rc = PTR_ERR(md4); cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); - return PTR_ERR(md4); + return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); sdescmd4 = kmalloc(size, GFP_KERNEL); -- cgit v1.2.1 From 1be912dde772b77aaaa21770eeabb0a7a5e297a6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jan 2011 07:08:28 -0500 Subject: cifs: handle cancelled requests better Currently, when a request is cancelled via signal, we delete the mid immediately. If the request was already transmitted however, the client is still likely to receive a response. When it does, it won't recognize it however and will pop a printk. It's also a little dangerous to just delete the mid entry like this. We may end up reusing that mid. If we do then we could potentially get the response from the first request confused with the later one. Prevent the reuse of mids by marking them as cancelled and keeping them on the pending_mid_q list. If the reply comes in, we'll delete it from the list then. If it never comes, then we'll delete it at reconnect or when cifsd comes down. Reviewed-by: Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/transport.c | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c1ccca1a933f..9b2d0373a8a7 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -579,8 +579,17 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, goto out; rc = wait_for_response(ses->server, midQ); - if (rc != 0) - goto out; + if (rc != 0) { + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } rc = sync_mid_result(midQ, ses->server); if (rc != 0) { @@ -724,8 +733,18 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, goto out; rc = wait_for_response(ses->server, midQ); - if (rc != 0) - goto out; + if (rc != 0) { + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } rc = sync_mid_result(midQ, ses->server); if (rc != 0) { @@ -922,10 +941,20 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, } } - if (wait_for_response(ses->server, midQ) == 0) { - /* We got the response - restart system call. */ - rstart = 1; + rc = wait_for_response(ses->server, midQ); + if (rc) { + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + return rc; + } + spin_unlock(&GlobalMid_Lock); } + + /* We got the response - restart system call. */ + rstart = 1; } rc = sync_mid_result(midQ, ses->server); -- cgit v1.2.1 From 2db7c5815555d8daabf7d4ab1253ce690852c140 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jan 2011 07:08:28 -0500 Subject: cifs: send an NT_CANCEL request when a process is signalled Use the new send_nt_cancel function to send an NT_CANCEL when the process is delivered a fatal signal. This is a "best effort" enterprise however, so don't bother to check the return code. There's nothing we can reasonably do if it fails anyway. Reviewed-by: Pavel Shilovsky Reviewed-by: Suresh Jayaraman Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/transport.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9b2d0373a8a7..bdaa4aa58b03 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -570,20 +570,25 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, #endif mutex_unlock(&ses->server->srv_mutex); - cifs_small_buf_release(in_buf); - if (rc < 0) + if (rc < 0) { + cifs_small_buf_release(in_buf); goto out; + } - if (long_op == CIFS_ASYNC_OP) + if (long_op == CIFS_ASYNC_OP) { + cifs_small_buf_release(in_buf); goto out; + } rc = wait_for_response(ses->server, midQ); if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); spin_lock(&GlobalMid_Lock); if (midQ->midState == MID_REQUEST_SUBMITTED) { midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); + cifs_small_buf_release(in_buf); atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; @@ -591,6 +596,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, spin_unlock(&GlobalMid_Lock); } + cifs_small_buf_release(in_buf); + rc = sync_mid_result(midQ, ses->server); if (rc != 0) { atomic_dec(&ses->server->inFlight); @@ -734,6 +741,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, rc = wait_for_response(ses->server, midQ); if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); spin_lock(&GlobalMid_Lock); if (midQ->midState == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ @@ -943,6 +951,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, rc = wait_for_response(ses->server, midQ); if (rc) { + send_nt_cancel(ses->server, in_buf, midQ); spin_lock(&GlobalMid_Lock); if (midQ->midState == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ -- cgit v1.2.1 From 68abaffa6bbd3cadfaa4b7216d10bcd32406090b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jan 2011 15:05:42 -0500 Subject: cifs: simplify SMB header check routine ...just cleanup. There should be no behavior change. Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/misc.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index a09e077ba925..72e99ece78cf 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -381,29 +381,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , } static int -checkSMBhdr(struct smb_hdr *smb, __u16 mid) +check_smb_hdr(struct smb_hdr *smb, __u16 mid) { - /* Make sure that this really is an SMB, that it is a response, - and that the message ids match */ - if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && - (mid == smb->Mid)) { - if (smb->Flags & SMBFLG_RESPONSE) - return 0; - else { - /* only one valid case where server sends us request */ - if (smb->Command == SMB_COM_LOCKING_ANDX) - return 0; - else - cERROR(1, "Received Request not response"); - } - } else { /* bad signature or mid */ - if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) - cERROR(1, "Bad protocol string signature header %x", - *(unsigned int *) smb->Protocol); - if (mid != smb->Mid) - cERROR(1, "Mids do not match"); + /* does it have the right SMB "signature" ? */ + if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { + cERROR(1, "Bad protocol string signature header 0x%x", + *(unsigned int *)smb->Protocol); + return 1; } - cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); + + /* Make sure that message ids match */ + if (mid != smb->Mid) { + cERROR(1, "Mids do not match. received=%u expected=%u", + smb->Mid, mid); + return 1; + } + + /* if it's a response then accept */ + if (smb->Flags & SMBFLG_RESPONSE) + return 0; + + /* only one valid case where server sends us request */ + if (smb->Command == SMB_COM_LOCKING_ANDX) + return 0; + + cERROR(1, "Server sent request, not response. mid=%u", smb->Mid); return 1; } @@ -448,7 +450,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) return 1; } - if (checkSMBhdr(smb, mid)) + if (check_smb_hdr(smb, mid)) return 1; clc_len = smbCalcSize_LE(smb); -- cgit v1.2.1 From d804d41d163c0975d2890c82d7135ada7a2f23a4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jan 2011 15:05:43 -0500 Subject: cifs: don't pop a printk when sending on a socket is interrupted If we kill the process while it's sending on a socket then the kernel_sendmsg will return -EINTR. This is normal. No need to spam the ring buffer with this info. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index bdaa4aa58b03..b8c5e2eb43d0 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) server->tcpStatus = CifsNeedReconnect; } - if (rc < 0) { + if (rc < 0 && rc != -EINTR) cERROR(1, "Error %d sending data on socket to server", rc); - } else + else rc = 0; /* Don't want to modify the buffer as a -- cgit v1.2.1 From 92a4e0f0169498867ecb19c2244510dd4beba149 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 29 Jan 2011 07:02:28 -0500 Subject: cifs: force a reconnect if there are too many MIDs in flight Currently, we allow the pending_mid_q to grow without bound with SIGKILL'ed processes. This could eventually be a DoS'able problem. An unprivileged user could a process that does a long-running call and then SIGKILL it. If he can also intercept the NT_CANCEL calls or the replies from the server, then the pending_mid_q could grow very large, possibly even to 2^16 entries which might leave GetNextMid in an infinite loop. Fix this by imposing a hard limit of 32k calls per server. If we cross that limit, set the tcpStatus to CifsNeedReconnect to force cifsd to eventually reconnect the socket and clean out the pending_mid_q. While we're at it, clean up the function a bit and eliminate an unnecessary NULL pointer check. Signed-off-by: Jeff Layton Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/misc.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 72e99ece78cf..24f0a9d97ad8 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server) { __u16 mid = 0; __u16 last_mid; - int collision; - - if (server == NULL) - return mid; + bool collision; spin_lock(&GlobalMid_Lock); last_mid = server->CurrentMid; /* we do not want to loop forever */ @@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server) (and it would also have to have been a request that did not time out) */ while (server->CurrentMid != last_mid) { - struct list_head *tmp; struct mid_q_entry *mid_entry; + unsigned int num_mids; - collision = 0; + collision = false; if (server->CurrentMid == 0) server->CurrentMid++; - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - - if ((mid_entry->mid == server->CurrentMid) && - (mid_entry->midState == MID_REQUEST_SUBMITTED)) { + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == server->CurrentMid && + mid_entry->midState == MID_REQUEST_SUBMITTED) { /* This mid is in use, try a different one */ - collision = 1; + collision = true; break; } } - if (collision == 0) { + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. + */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { mid = server->CurrentMid; break; } -- cgit v1.2.1 From f855f6cbeb4f94cd4e4a225c2246ee8012c384a2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Jan 2011 08:41:36 -0500 Subject: cifs: make CIFS depend on CRYPTO_MD4 Recently CIFS was changed to use the kernel crypto API for MD4 hashes, but the Kconfig dependencies were not changed to reflect this. Signed-off-by: Jeff Layton Reported-and-Tested-by: Suresh Jayaraman Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index ee45648b0d1a..7cb0f7f847e4 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -3,6 +3,7 @@ config CIFS depends on INET select NLS select CRYPTO + select CRYPTO_MD4 select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ARC4 -- cgit v1.2.1 From 31c2659d78c8be970833bc1e633593d291553ed3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Jan 2011 07:24:46 -0500 Subject: cifs: clean up some compiler warnings New compiler warnings that I noticed when building a patchset based on recent Fedora kernel: fs/cifs/cifssmb.c: In function 'CIFSSMBSetFileSize': fs/cifs/cifssmb.c:4813:8: warning: variable 'data_offset' set but not used [-Wunused-but-set-variable] fs/cifs/file.c: In function 'cifs_open': fs/cifs/file.c:349:24: warning: variable 'pCifsInode' set but not used [-Wunused-but-set-variable] fs/cifs/file.c: In function 'cifs_partialpagewrite': fs/cifs/file.c:1149:23: warning: variable 'cifs_sb' set but not used [-Wunused-but-set-variable] fs/cifs/file.c: In function 'cifs_iovec_write': fs/cifs/file.c:1740:9: warning: passing argument 6 of 'CIFSSMBWrite2' from incompatible pointer type [enabled by default] fs/cifs/cifsproto.h:337:12: note: expected 'unsigned int *' but argument is of type 'size_t *' fs/cifs/readdir.c: In function 'cifs_readdir': fs/cifs/readdir.c:767:23: warning: variable 'cifs_sb' set but not used [-Wunused-but-set-variable] fs/cifs/cifs_dfs_ref.c: In function 'cifs_dfs_d_automount': fs/cifs/cifs_dfs_ref.c:342:2: warning: 'rc' may be used uninitialized in this function [-Wuninitialized] fs/cifs/cifs_dfs_ref.c:278:6: note: 'rc' was declared here Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 9 ++++----- fs/cifs/cifssmb.c | 3 --- fs/cifs/file.c | 8 ++------ fs/cifs/readdir.c | 3 --- 4 files changed, 6 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index f1c68629f277..0a265ad9e426 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) cFYI(1, "in %s", __func__); BUG_ON(IS_ROOT(mntpt)); - xid = GetXid(); - /* * The MSDFS spec states that paths in DFS referral requests and * responses must be prefixed by a single '\' character instead of @@ -293,7 +291,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-ENOMEM); full_path = build_path_from_dentry(mntpt); if (full_path == NULL) - goto free_xid; + goto cdda_exit; cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); @@ -303,9 +301,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) } ses = tlink_tcon(tlink)->ses; + xid = GetXid(); rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, &num_referrals, &referrals, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + FreeXid(xid); cifs_put_tlink(tlink); @@ -338,8 +338,7 @@ success: free_dfs_info_array(referrals, num_referrals); free_full_path: kfree(full_path); -free_xid: - FreeXid(xid); +cdda_exit: cFYI(1, "leaving %s" , __func__); return mnt; } diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3106f5e5c633..46c66ed01af4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4914,7 +4914,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, __u16 fid, __u32 pid_of_opener, bool SetAllocation) { struct smb_com_transaction2_sfi_req *pSMB = NULL; - char *data_offset; struct file_end_of_file_info *parm_data; int rc = 0; __u16 params, param_offset, offset, byte_count, count; @@ -4938,8 +4937,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; offset = param_offset + params; - data_offset = (char *) (&pSMB->hdr.Protocol) + offset; - count = sizeof(struct file_end_of_file_info); pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find exact max SMB PDU from sess structure BB */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0de17c1db608..74c0a282d012 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file) struct cifsTconInfo *tcon; struct tcon_link *tlink; struct cifsFileInfo *pCifsFile = NULL; - struct cifsInodeInfo *pCifsInode; char *full_path = NULL; bool posix_open_ok = false; __u16 netfid; @@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file) } tcon = tlink_tcon(tlink); - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { rc = -ENOMEM; @@ -1146,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) char *write_data; int rc = -EFAULT; int bytes_written = 0; - struct cifs_sb_info *cifs_sb; struct inode *inode; struct cifsFileInfo *open_file; @@ -1154,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return -EFAULT; inode = page->mapping->host; - cifs_sb = CIFS_SB(inode->i_sb); offset += (loff_t)from; write_data = kmap(page); @@ -1667,7 +1662,8 @@ static ssize_t cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) { - size_t total_written = 0, written = 0; + size_t total_written = 0; + unsigned int written = 0; unsigned long num_pages, npages; size_t copied, len, cur_len, i; struct kvec *to_send; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 7f25cc3d2256..f8e4cd2a7912 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) { int rc = 0; int xid, i; - struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; struct cifsFileInfo *cifsFile = NULL; char *current_entry; @@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) xid = GetXid(); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. Otherwise we won't be able to notify VFS in case of failure. -- cgit v1.2.1 From 7a8587e7c8e4e32ba778bfbbb822a0a7e8d5f3e3 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Sat, 29 Jan 2011 13:54:58 -0600 Subject: cifs: No need to check crypto blockcipher allocation Missed one change as per earlier suggestion. Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 0db5f1de0227..a51585f9852b 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -657,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses) get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); - if (!tfm_arc4 || IS_ERR(tfm_arc4)) { + if (IS_ERR(tfm_arc4)) { + rc = PTR_ERR(tfm_arc4); cERROR(1, "could not allocate crypto API arc4\n"); - return PTR_ERR(tfm_arc4); + return rc; } desc.tfm = tfm_arc4; -- cgit v1.2.1 From b1953bcec95c189b1eea690a08e89646d7750bda Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jan 2011 21:10:01 +0000 Subject: Btrfs: make shrink_delalloc a little friendlier Xfstests 224 will just sit there and spin for ever until eventually we give up flushing delalloc and exit. On my box this took several hours. I could not interrupt this process either, even though we use INTERRUPTIBLE. So do 2 things 1) Keep us from looping over and over again without reclaiming anything 2) If we get interrupted exit the loop I tested this and the test now exits in a reasonable amount of time, and can be interrupted with ctrl+c. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ff6bbfd75cf7..f96641a93fc9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3345,8 +3345,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 reserved; u64 max_reclaim; u64 reclaimed = 0; + long time_left; int pause = 1; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + int loops = 0; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; @@ -3359,7 +3361,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); - while (1) { + while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, @@ -3367,8 +3369,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) + if (reserved > space_info->bytes_reserved) { + loops = 0; reclaimed += reserved - space_info->bytes_reserved; + } else { + loops++; + } reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); @@ -3379,7 +3385,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return -EAGAIN; __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(pause); + time_left = schedule_timeout(pause); + + /* We were interrupted, exit */ + if (time_left) + break; + pause <<= 1; if (pause > HZ / 10) pause = HZ / 10; -- cgit v1.2.1 From b31eabd86eb68d3c217e6821078249bc045e698a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 31 Jan 2011 16:48:24 -0500 Subject: Btrfs: catch errors from btrfs_sync_log btrfs_sync_log returns -EAGAIN when we need full transaction commits instead of small log commits, but sometimes we were dropping the return value. In practice, we check for this a few different ways, but this is still a bug that can leave off full log commits when we really need them. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c25a41d86118..42dfc3077040 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2051,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = 0; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); @@ -2115,7 +2116,7 @@ out: smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); - return 0; + return ret; } static void free_log_tree(struct btrfs_trans_handle *trans, -- cgit v1.2.1 From cab6958da0094e36a098751f844409fc9ee26251 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 31 Jan 2011 21:56:35 +0000 Subject: [CIFS] Update cifs minor version Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 14789a97304e..4a3330235d55 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.69" +#define CIFS_VERSION "1.70" #endif /* _CIFSFS_H */ -- cgit v1.2.1 From 6284644e8de1f4005166c918c3d2aa4c510ab9f6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Jan 2011 09:14:17 -0500 Subject: cifs: fix length checks in checkSMB The cERROR message in checkSMB when the calculated length doesn't match the RFC1001 length is incorrect in many cases. It always says that the RFC1001 length is bigger than the SMB, even when it's actually the reverse. Fix the error message to say the reverse of what it does now when the SMB length goes beyond the end of the received data. Also, clarify the error message when the RFC length is too big. Finally, clarify the comments to show that the 512 byte limit on extra data at the end of the packet is arbitrary. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/misc.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 24f0a9d97ad8..2a930a752a78 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -478,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } - cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", + cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", clc_len, 4 + len, smb->Mid); - /* Windows XP can return a few bytes too much, presumably - an illegal pad, at the end of byte range lock responses - so we allow for that three byte pad, as long as actual - received length is as long or longer than calculated length */ - /* We have now had to extend this more, since there is a - case in which it needs to be bigger still to handle a - malformed response to transact2 findfirst from WinXP when - access denied is returned and thus bcc and wct are zero - but server says length is 0x21 bytes too long as if the server - forget to reset the smb rfc1001 length when it reset the - wct and bcc to minimum size and drop the t2 parms and data */ - if ((4+len > clc_len) && (len <= clc_len + 512)) - return 0; - else { - cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d", + + if (4 + len < clc_len) { + cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", len, smb->Mid); return 1; + } else if (len > clc_len + 512) { + /* + * Some servers (Windows XP in particular) send more + * data than the lengths in the SMB packet would + * indicate on certain calls (byte range locks and + * trans2 find first calls in particular). While the + * client can handle such a frame by ignoring the + * trailing data, we choose limit the amount of extra + * data to 512 bytes. + */ + cERROR(1, "RFC1001 size %u more than 512 bytes larger " + "than SMB for mid=%u", len, smb->Mid); + return 1; } } return 0; -- cgit v1.2.1 From c87fb6fdcaf7560940b31a0c78c3e6370e3433cf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 31 Jan 2011 19:54:59 -0500 Subject: Btrfs: avoid uninit variable warnings in ordered-data.c This one isn't really an uninit variable, but for pretty obscure reasons. Let's make it clearly correct. Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2b61e1ddcd99..083a55477375 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { struct rb_root *root = &tree->tree; - struct rb_node *prev; + struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; -- cgit v1.2.1 From 5df67083488ccbad925f583b698ab38f8629a016 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 1 Feb 2011 09:17:35 +0000 Subject: btrfs: checking NULL or not in some functions Because NULL is returned when the memory allocation fails, it is checked whether it is NULL. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/extent_io.c | 2 ++ fs/btrfs/tree-log.c | 6 ++++++ 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f96641a93fc9..9de4ff03882a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6496,6 +6496,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start, int ret = 0; ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; mutex_lock(&inode->i_mutex); first_index = start >> PAGE_CACHE_SHIFT; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6411ed6ca449..8862dda46ff6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1920,6 +1920,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, nr = bio_get_nr_vecs(bdev); bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 42dfc3077040..6d66e5caff97 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2751,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } min_key.objectid = inode->i_ino; min_key.type = BTRFS_INODE_ITEM_KEY; -- cgit v1.2.1 From 98d5dc13e7e74b77ca3b4c3cbded9f48d2dbbbb7 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 20 Jan 2011 06:19:37 +0000 Subject: btrfs: fix return value check of btrfs_start_transaction() The error check of btrfs_start_transaction() is added, and the mistake of the error check on several places is corrected. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 +++++-- fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 10 ++++++++-- fs/btrfs/relocation.c | 3 +++ fs/btrfs/super.c | 2 ++ fs/btrfs/tree-log.c | 1 + fs/btrfs/volumes.c | 19 +++++++++++++++++-- 7 files changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9de4ff03882a..f07ba21cbf06 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6271,6 +6271,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, BUG_ON(!wc); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + if (block_rsv) trans->block_rsv = block_rsv; @@ -6368,6 +6370,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); if (block_rsv) trans->block_rsv = block_rsv; } @@ -7587,7 +7590,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root) if (found) { trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); } @@ -7831,7 +7834,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, trans = btrfs_start_transaction(extent_root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); if (extent_key->objectid == 0) { ret = del_extent_zero(trans, extent_root, path, extent_key); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5621818921f8..36bc3f49ebf9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2357,6 +2357,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) */ if (is_bad_inode(inode)) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 12dabe28cf54..02d224e8c83f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -907,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -2141,9 +2145,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { btrfs_free_path(path); - return -ENOMEM; + return PTR_ERR(trans); } dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); @@ -2337,6 +2341,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp u64 transid; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); transid = trans->transid; btrfs_commit_transaction_async(trans, root, 0); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ea9965430241..1f5556acb530 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2028,6 +2028,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, while (1) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, @@ -3665,6 +3666,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { trans = btrfs_start_transaction(rc->extent_root, 0); + BUG_ON(IS_ERR(trans)); if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); @@ -4033,6 +4035,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) int ret; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f4e45fdded30..0209b5fc772c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -623,6 +623,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6d66e5caff97..a4bbb854dfd2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3112,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) BUG_ON(!path); trans = btrfs_start_transaction(fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); wc.trans = trans; wc.pin = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2d2f4ccc738..7cad59353b09 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1212,6 +1212,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, return -ENOMEM; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1604,6 +1608,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + lock_chunks(root); device->barriers = 1; @@ -1872,7 +1882,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); lock_chunks(root); @@ -2046,7 +2056,7 @@ int btrfs_balance(struct btrfs_root *dev_root) BUG_ON(ret); trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_grow_device(trans, device, old_size); BUG_ON(ret); @@ -2212,6 +2222,11 @@ again: /* Shrinking succeeded, else we would be at "done". */ trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + lock_chunks(root); device->disk_total_bytes = new_size; -- cgit v1.2.1 From 9587fcff42f5bece3c0a44066b079235ee73cbb3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 1 Feb 2011 08:40:43 -0500 Subject: cifs: fix length vs. total_read confusion in cifs_demultiplex_thread length at this point is the length returned by the last kernel_recvmsg call. total_read is the length of all of the data read so far. length is more or less meaningless at this point, so use total_read for everything. Signed-off-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 47d8ff623683..945b2202275f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -578,12 +578,12 @@ incomplete_rcv: else if (reconnect == 1) continue; - length += 4; /* account for rfc1002 hdr */ + total_read += 4; /* account for rfc1002 hdr */ - - dump_smb(smb_buffer, length); - if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { - cifs_dump_mem("Bad SMB: ", smb_buffer, 48); + dump_smb(smb_buffer, total_read); + if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) { + cifs_dump_mem("Bad SMB: ", smb_buffer, + total_read < 48 ? total_read : 48); continue; } -- cgit v1.2.1 From 0781b909b5586f4db720b5d1838b78f9d8e42f14 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Feb 2011 15:52:35 -0800 Subject: epoll: epoll_wait() should not use timespec_add_ns() commit 95aac7b1cd224f ("epoll: make epoll_wait() use the hrtimer range feature") added a performance regression because it uses timespec_add_ns() with potential very large 'ns' values. [akpm@linux-foundation.org: s/epoll_set_mstimeout/ep_set_mstimeout/, per Davide] Reported-by: Simon Kirby Signed-off-by: Eric Dumazet Cc: Shawn Bohrer Acked-by: Davide Libenzi Cc: [2.6.37.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cc8a9b7d6064..267d0ada4541 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1114,6 +1114,17 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed); } +static inline struct timespec ep_set_mstimeout(long ms) +{ + struct timespec now, ts = { + .tv_sec = ms / MSEC_PER_SEC, + .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), + }; + + ktime_get_ts(&now); + return timespec_add_safe(now, ts); +} + static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { @@ -1121,12 +1132,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, unsigned long flags; long slack; wait_queue_t wait; - struct timespec end_time; ktime_t expires, *to = NULL; if (timeout > 0) { - ktime_get_ts(&end_time); - timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); + struct timespec end_time = ep_set_mstimeout(timeout); + slack = select_estimate_accuracy(&end_time); to = &expires; *to = timespec_to_ktime(end_time); -- cgit v1.2.1 From 3cd90ea42f2c15f928b70ed66f6d8ed0a8e7aadd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 Feb 2011 15:52:46 -0800 Subject: vfs: sparse: add __FMODE_EXEC FMODE_EXEC is a constant type of fmode_t but was used with normal integer constants. This results in following warnings from sparse. Fix it using new macro __FMODE_EXEC. fs/exec.c:116:58: warning: restricted fmode_t degrades to integer fs/exec.c:689:58: warning: restricted fmode_t degrades to integer fs/fcntl.c:777:9: warning: restricted fmode_t degrades to integer Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ++-- fs/fcntl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index c62efcb959c7..52a447d9b6ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -120,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto out; file = do_filp_open(AT_FDCWD, tmp, - O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, + O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, MAY_READ | MAY_EXEC | MAY_OPEN); putname(tmp); error = PTR_ERR(file); @@ -723,7 +723,7 @@ struct file *open_exec(const char *name) int err; file = do_filp_open(AT_FDCWD, name, - O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, + O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, MAY_EXEC | MAY_OPEN); if (IS_ERR(file)) goto out; diff --git a/fs/fcntl.c b/fs/fcntl.c index ecc8b3954ed6..cb1026181bdc 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -815,7 +815,7 @@ static int __init fcntl_init(void) __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - FMODE_EXEC + __FMODE_EXEC )); fasync_cache = kmem_cache_create("fasync_cache", -- cgit v1.2.1 From d54cdc8ca7aabc69e145a62155855db42b04ed0b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Feb 2011 15:52:47 -0800 Subject: fs: make block fiemap mapping length at least blocksize long Some filesystems don't deal well with being asked to map less than blocksize blocks (GFS2 for example). Since we are always mapping at least blocksize sections anyway, just make sure len is at least as big as a blocksize so we don't trip up any filesystems. Thanks, Signed-off-by: Josef Bacik Cc: Steven Whitehouse Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index a59635e295fa..1eebeb72b202 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode, len = isize; } + /* + * Some filesystems can't deal with being asked to map less than + * blocksize, so make sure our len is at least block length. + */ + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); -- cgit v1.2.1 From 0b0abeaf3d30cec03ac6497fe978b8f7edecc5ae Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 2 Feb 2011 21:02:12 +0200 Subject: Revert "exofs: Set i_mapping->backing_dev_info anyway" This reverts commit 115e19c53501edc11f730191f7f047736815ae3d. Apparently setting inode->bdi to one's own sb->s_bdi stops VFS from sending *read-aheads*. This problem was bisected to this commit. A revert fixes it. I'll investigate farther why is this happening for the next Kernel, but for now a revert. I'm sending to stable@kernel.org as well, since it exists also in 2.6.37. 2.6.36 is good and does not have this patch. CC: Stable Tree Signed-off-by: Boaz Harrosh Signed-off-by: Linus Torvalds --- fs/exofs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 42685424817b..a7555238c41a 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1030,7 +1030,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1131,7 +1130,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; - inode->i_mapping->backing_dev_info = sb->s_bdi; sb->s_dirt = 1; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; -- cgit v1.2.1 From 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 3 Feb 2011 14:33:15 -0500 Subject: ext4: fix panic on module unload when stopping lazyinit thread https://bugzilla.kernel.org/show_bug.cgi?id=27652 If the lazyinit thread is running, the teardown function ext4_destroy_lazyinit_thread() has problems: ext4_clear_request_list(); while (ext4_li_info->li_task) { wake_up(&ext4_li_info->li_wait_daemon); wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task == NULL); } Clearing the request list will cause the thread to exit and free ext4_li_info, so then we're waiting on something which is getting freed. Fix this up by making the thread respond to kthread_stop, and exit, without the need to wait for that exit in some other homegrown way. Cc: stable@kernel.org Reported-and-Tested-by: Tao Ma Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 48ce561fafac..3d8cf2cab379 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { @@ -2716,6 +2717,8 @@ static void ext4_unregister_li_request(struct super_block *sb) mutex_unlock(&ext4_li_info->li_list_mtx); } +static struct task_struct *ext4_lazyinit_task; + /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. @@ -2784,6 +2787,10 @@ cont_thread: if (time_before(jiffies, next_wakeup)) schedule(); finish_wait(&eli->li_wait_daemon, &wait); + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } } exit_thread: @@ -2808,6 +2815,7 @@ exit_thread: wake_up(&eli->li_wait_task); kfree(ext4_li_info); + ext4_lazyinit_task = NULL; ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); @@ -2830,11 +2838,10 @@ static void ext4_clear_request_list(void) static int ext4_run_lazyinit_thread(void) { - struct task_struct *t; - - t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); - if (IS_ERR(t)) { - int err = PTR_ERR(t); + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); del_timer_sync(&ext4_li_info->li_timer); kfree(ext4_li_info); @@ -2985,16 +2992,10 @@ static void ext4_destroy_lazyinit_thread(void) * If thread exited earlier * there's nothing to be done. */ - if (!ext4_li_info) + if (!ext4_li_info || !ext4_lazyinit_task) return; - ext4_clear_request_list(); - - while (ext4_li_info->li_task) { - wake_up(&ext4_li_info->li_wait_daemon); - wait_event(ext4_li_info->li_wait_task, - ext4_li_info->li_task == NULL); - } + kthread_stop(ext4_lazyinit_task); } static int ext4_fill_super(struct super_block *sb, void *data, int silent) -- cgit v1.2.1 From 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 3 Feb 2011 14:33:33 -0500 Subject: ext4: unregister features interface on module unload Ext4 features interface was not properly unregistered which led to problems while unloading/reloading ext4 module. This commit fixes that by adding proper kobject unregistration code into ext4_exit_fs() as well as fail-path of ext4_init_fs() Reported-by: Eric Sandeen Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3d8cf2cab379..4898cb1ff606 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4769,7 +4769,7 @@ static struct file_system_type ext4_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -int __init ext4_init_feat_adverts(void) +static int __init ext4_init_feat_adverts(void) { struct ext4_features *ef; int ret = -ENOMEM; @@ -4793,6 +4793,13 @@ out: return ret; } +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + static int __init ext4_init_fs(void) { int err; @@ -4839,7 +4846,7 @@ out1: out2: ext4_exit_mballoc(); out3: - kfree(ext4_feat); + ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); out4: @@ -4858,6 +4865,7 @@ static void __exit ext4_exit_fs(void) destroy_inodecache(); ext4_exit_xattr(); ext4_exit_mballoc(); + ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); ext4_exit_system_zone(); -- cgit v1.2.1 From dd68314ccf3fb918c1fb6471817edbc60ece4b52 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 3 Feb 2011 14:33:49 -0500 Subject: ext4: fix up ext4 error handling Make sure we the correct cleanup happens if we die while trying to load the ext4 file system. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4898cb1ff606..86b05486dc63 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4810,13 +4810,17 @@ static int __init ext4_init_fs(void) return err; err = ext4_init_system_zone(); if (err) - goto out5; + goto out7; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - goto out4; + goto out6; ext4_proc_root = proc_mkdir("fs/ext4", NULL); + if (!ext4_proc_root) + goto out5; err = ext4_init_feat_adverts(); + if (err) + goto out4; err = ext4_init_mballoc(); if (err) @@ -4847,11 +4851,13 @@ out2: ext4_exit_mballoc(); out3: ext4_exit_feat_adverts(); +out4: remove_proc_entry("fs/ext4", NULL); +out5: kset_unregister(ext4_kset); -out4: +out6: ext4_exit_system_zone(); -out5: +out7: ext4_exit_pageio(); return err; } -- cgit v1.2.1 From c5b8d0bce052949e173b5b32f96bd59bceaa2ab0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2011 09:32:39 -0700 Subject: hfsplus: fix failed mount handling Currently the error handling in hfsplus_fill_super is a mess, and can lead to accessing fields in the superblock that haven't been even set up yet. Fix this by making sure we do not set up sb->s_root until we have the mount fully set up, and before that do proper step by step unwinding instead of using hfsplus_put_super as a big hammer. Reported-by: Dan Williams Signed-off-by: Christoph Hellwig --- fs/hfsplus/super.c | 106 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9a3b4795f43c..b49b55584c84 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) struct inode *root, *inode; struct qstr str; struct nls_table *nls = NULL; - int err = -EINVAL; + int err; + err = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; + goto out; sb->s_fs_info = sbi; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); hfsplus_fill_defaults(sbi); + + err = -EINVAL; if (!hfsplus_parse_options(data, sbi)) { printk(KERN_ERR "hfs: unable to parse mount options\n"); - err = -EINVAL; - goto cleanup; + goto out_unload_nls; } /* temporarily use utf8 to correctly find the hidden dir below */ @@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sbi->nls = load_nls("utf8"); if (!sbi->nls) { printk(KERN_ERR "hfs: unable to load nls for utf8\n"); - err = -EINVAL; - goto cleanup; + goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); - err = -EINVAL; - goto cleanup; + goto out_unload_nls; } vhdr = sbi->s_vhdr; @@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { printk(KERN_ERR "hfs: wrong filesystem version\n"); - goto cleanup; + goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); @@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { printk(KERN_ERR "hfs: failed to load extents file\n"); - goto cleanup; + goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { printk(KERN_ERR "hfs: failed to load catalog file\n"); - goto cleanup; + goto out_close_ext_tree; } inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { printk(KERN_ERR "hfs: failed to load allocation file\n"); err = PTR_ERR(inode); - goto cleanup; + goto out_close_cat_tree; } sbi->alloc_file = inode; @@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(root)) { printk(KERN_ERR "hfs: failed to load root directory\n"); err = PTR_ERR(root); - goto cleanup; - } - sb->s_d_op = &hfsplus_dentry_operations; - sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - iput(root); - err = -ENOMEM; - goto cleanup; + goto out_put_alloc_file; } str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; @@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) - goto cleanup; + goto out_put_root; inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto cleanup; + goto out_put_root; } sbi->hidden_dir = inode; } else hfs_find_exit(&fd); - if (sb->s_flags & MS_RDONLY) - goto out; + if (!(sb->s_flags & MS_RDONLY)) { + /* + * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused + * all three are registered with Apple for our use + */ + vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); + vhdr->modify_date = hfsp_now2mt(); + be32_add_cpu(&vhdr->write_count, 1); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); + hfsplus_sync_fs(sb, 1); - /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused - * all three are registered with Apple for our use - */ - vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); - vhdr->modify_date = hfsp_now2mt(); - be32_add_cpu(&vhdr->write_count, 1); - vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); - vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); - hfsplus_sync_fs(sb, 1); - - if (!sbi->hidden_dir) { - mutex_lock(&sbi->vh_mutex); - sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, - &str, sbi->hidden_dir); - mutex_unlock(&sbi->vh_mutex); - - hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); + if (!sbi->hidden_dir) { + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, + sbi->hidden_dir); + mutex_unlock(&sbi->vh_mutex); + + hfsplus_mark_inode_dirty(sbi->hidden_dir, + HFSPLUS_I_CAT_DIRTY); + } } -out: + + sb->s_d_op = &hfsplus_dentry_operations; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_put_hidden_dir; + } + unload_nls(sbi->nls); sbi->nls = nls; return 0; -cleanup: - hfsplus_put_super(sb); +out_put_hidden_dir: + iput(sbi->hidden_dir); +out_put_root: + iput(sbi->alloc_file); +out_put_alloc_file: + iput(sbi->alloc_file); +out_close_cat_tree: + hfs_btree_close(sbi->cat_tree); +out_close_ext_tree: + hfs_btree_close(sbi->ext_tree); +out_free_vhdr: + kfree(sbi->s_vhdr); + kfree(sbi->s_backup_vhdr); +out_unload_nls: + unload_nls(sbi->nls); unload_nls(nls); + kfree(sbi); +out: return err; } -- cgit v1.2.1 From 14dd01f88319a37b06ca909738044e39ec5bfdee Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 1 Feb 2011 16:41:55 -0500 Subject: hfsplus: do not leak buffer on error Signed-Off-By: Chuck Ebbert Signed-off-by: Christoph Hellwig --- fs/hfsplus/part_tbl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index d66ad113b1cc..40ad88c12c64 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb, res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, data, READ); if (res) - return res; + goto out; switch (be16_to_cpu(*((__be16 *)data))) { case HFS_OLD_PMAP_MAGIC: @@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb, res = -ENOENT; break; } - +out: kfree(data); return res; } -- cgit v1.2.1 From a1dbcef0172555464b5329f8ba47d43c98132dfa Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Wed, 2 Feb 2011 10:55:06 -0500 Subject: hfsplus: fix two memory leaks in wrapper.c Signed-Off-By: Chuck Ebbert Signed-off-by: Christoph Hellwig --- fs/hfsplus/wrapper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 196231794f64..3031d81f5f0f 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -167,7 +167,7 @@ reread: break; case cpu_to_be16(HFSP_WRAP_MAGIC): if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) - goto out; + goto out_free_backup_vhdr; wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; part_size = wd.embed_count * wd.ablk_size; @@ -179,7 +179,7 @@ reread: * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) - goto out; + goto out_free_backup_vhdr; goto reread; } -- cgit v1.2.1 From 1065348d472f97b4b8eb53b60ec67e99148cbbca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2011 09:40:33 -0700 Subject: hfsplus: fix up a comparism in hfsplus_file_extend Revert an incorrect hunk from commit b2837fcf4994e699a4def002e26f274d95b387c1, "hfsplus: %L-to-%ll, macro correction, and remove unneeded braces" revert a pointless change of comparism operation argument order, which turned out to not even be equivalent. Reported-by: Joe Perches Signed-off-by: Christoph Hellwig --- fs/hfsplus/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 52a0bcaa7b6d..b1991a2a08e0 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode) u32 start, len, goal; int res; - if (sbi->total_blocks - sbi->free_blocks + 8 > - sbi->alloc_file->i_size * 8) { + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ printk(KERN_ERR "hfs: extend alloc file! " "(%llu,%u,%u)\n", -- cgit v1.2.1 From 76429c148b939f5a6863c0a024eb8960ae91469a Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 31 Jan 2011 16:03:08 +0300 Subject: CIFS: Fix variable types in cifs_iovec_read/write (try #2) Variable 'i' should be unsigned long as it's used in circle with num_pages, and bytes_read/total_written should be ssize_t according to return value. Signed-off-by: Pavel Shilovsky Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/file.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 74c0a282d012..e964b1cd5dd0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1662,10 +1662,10 @@ static ssize_t cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) { - size_t total_written = 0; - unsigned int written = 0; - unsigned long num_pages, npages; - size_t copied, len, cur_len, i; + unsigned int written; + unsigned long num_pages, npages, i; + size_t copied, len, cur_len; + ssize_t total_written = 0; struct kvec *to_send; struct page **pages; struct iov_iter it; @@ -1821,7 +1821,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, { int rc; int xid; - unsigned int total_read, bytes_read = 0; + ssize_t total_read; + unsigned int bytes_read = 0; size_t len, cur_len; int iov_offset = 0; struct cifs_sb_info *cifs_sb; -- cgit v1.2.1 From 78d2978874e4e10e97dfd4fd79db45bdc0748550 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 4 Feb 2011 18:13:24 +0000 Subject: CRED: Fix kernel panic upon security_file_alloc() failure. In get_empty_filp() since 2.6.29, file_free(f) is called with f->f_cred == NULL when security_file_alloc() returned an error. As a result, kernel will panic() due to put_cred(NULL) call within RCU callback. Fix this bug by assigning f->f_cred before calling security_file_alloc(). Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index c3e89adf53c0..eb36b6b17e26 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -125,13 +125,13 @@ struct file *get_empty_filp(void) goto fail; percpu_counter_inc(&nr_files); + f->f_cred = get_cred(cred); if (security_file_alloc(f)) goto fail_sec; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_cred = get_cred(cred); spin_lock_init(&f->f_lock); eventpoll_init_file(f); /* f->f_version: 0 */ -- cgit v1.2.1 From 64474bdd07f673cc48509ea0375274422c8f73bf Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 3 Feb 2011 14:31:18 -0600 Subject: cifs: Possible slab memory corruption while updating extended stats (repost) Updating extended statistics here can cause slab memory corruption if a callback function frees slab memory (mid_entry). Signed-off-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 945b2202275f..1f32a2893b5f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -633,11 +633,11 @@ incomplete_rcv: mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: mid_entry->midState = MID_RESPONSE_RECEIVED; - list_del_init(&mid_entry->qhead); - mid_entry->callback(mid_entry); #ifdef CONFIG_CIFS_STATS2 mid_entry->when_received = jiffies; #endif + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); break; } mid_entry = NULL; -- cgit v1.2.1 From e3f0dadb2b44746f6223ce4560406d19e02fb1cc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Feb 2011 07:21:26 -0500 Subject: cifs: enable signing flag in SMB header when server has it on cifs_sign_smb only generates a signature if the correct Flags2 bit is set. Make sure that it gets set correctly if we're sending an async call. This patch fixes: https://bugzilla.kernel.org/show_bug.cgi?id=28142 Reported-and-Tested-by: JG Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/transport.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b8c5e2eb43d0..fbc5aace54b1 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf, if (rc) return rc; + /* enable signing if server requires it */ + if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + mutex_lock(&server->srv_mutex); mid = AllocMidQEntry(in_buf, server); if (mid == NULL) { -- cgit v1.2.1 From 247ec9b418ba50c9022280035330059364d54540 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Feb 2011 17:09:50 -0500 Subject: cifs: don't send an echo request unless NegProt has been done When the socket to the server is disconnected, the client more or less immediately calls cifs_reconnect to reconnect the socket. The NegProt and SessSetup however are not done until an actual call needs to be made. With the addition of the SMB echo code, it's possible that the server will initiate a disconnect on an idle socket. The client will then reconnect the socket but no NegotiateProtocol request is done. The SMBEcho workqueue job will then eventually pop, and an SMBEcho will be sent on the socket. The server will then reject it since no NegProt was done. The ideal fix would be to either have the socket not be reconnected until we plan to use it, or to immediately do a NegProt when the reconnect occurs. The code is not structured for this however. For now we must just settle for not sending any echoes until the NegProt is done. Reported-by: JG Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1f32a2893b5f..257b6d895e20 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -337,8 +337,12 @@ cifs_echo_request(struct work_struct *work) struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - /* no need to ping if we got a response recently */ - if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + /* + * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done. + * Also, no need to ping if we got a response recently + */ + if (server->tcpStatus != CifsGood || + time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) goto requeue_echo; rc = CIFSSMBEcho(server); -- cgit v1.2.1 From 8132b65bc6ce6d9a4baafdfc28c7cd9c258ed6e4 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Sun, 6 Feb 2011 02:05:28 +0300 Subject: cifs: add check for kmalloc in parse_dacl Exit from parse_dacl if no memory returned from the call to kmalloc. Signed-off-by: Stanislav Fomichev Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1e7636b145a8..beeebf194234 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); + if (!ppace) { + cERROR(1, "DACL memory allocation error"); + return; + } for (i = 0; i < num_aces; ++i) { ppace[i] = (struct cifs_ace *) (acl_base + acl_size); -- cgit v1.2.1 From 13dbc08987f25d9dba488a34b44b43e3844b027c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Feb 2011 02:39:52 +0000 Subject: Btrfs: make sure search_bitmap finds something in remove_from_bitmap When we're cleaning up the tree log we need to be able to remove free space from the block group. The problem is if that free space spans bitmaps we would not find the space since we're looking for too many bytes. So make sure the amount of bytes we search for is limited to either the number of bytes we want, or the number of bytes left in the bitmap. This was tested by a user who was hitting the BUG() after search_bitmap. With this patch he can now mount his fs. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a5501edc3c9f..a0390657451b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1216,6 +1216,7 @@ again: */ search_start = *offset; search_bytes = *bytes; + search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(block_group, bitmap_info, &search_start, &search_bytes); BUG_ON(ret < 0 || search_start != *offset); -- cgit v1.2.1 From 3c14874acc71180553fb5aba528e3cf57c5b958b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Feb 2011 15:53:47 +0000 Subject: Btrfs: exclude super blocks when we read in block groups This has been resulting in a BUT_ON(ret) after btrfs_reserve_extent in btrfs_cow_file_range. The reason is we don't actually calculate the bytes_super for a block group until we go to cache it, which means that the space_info can hand out reservations for space that it doesn't actually have, and we can run out of data space. This is also a problem if you are using space caching since we don't ever calculate bytes_super for the block groups. So instead everytime we read a block group call exclude_super_stripes, which calculates the bytes_super for the block group so it can be left out of the space_info. Then whenever caching completes we just call free_excluded_extents so that the super excluded extents are freed up. Also if we are unmounting and we hit any block groups that haven't been cached we still need to call free_excluded_extents to make sure things are cleaned up properly. Thanks, Reported-by: Arne Jansen Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f07ba21cbf06..565e22d77b1b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -320,11 +320,6 @@ static int caching_kthread(void *data) if (!path) return -ENOMEM; - exclude_super_stripes(extent_root, block_group); - spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_readonly += block_group->bytes_super; - spin_unlock(&block_group->space_info->lock); - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); /* @@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } spin_unlock(&cache->lock); - if (ret == 1) + if (ret == 1) { + free_excluded_extents(fs_info->extent_root, cache); return 0; + } } if (load_cache_only) @@ -4036,6 +4033,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); atomic_dec(&BTRFS_I(inode)->outstanding_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); @@ -8325,6 +8323,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO) + free_excluded_extents(info->extent_root, block_group); + btrfs_remove_free_space_cache(block_group); btrfs_put_block_group(block_group); @@ -8439,6 +8444,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + exclude_super_stripes(root, cache); + /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't @@ -8447,12 +8459,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, -- cgit v1.2.1 From 554233a6e0e8557e8e81e54cc70628d101291122 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 3 Feb 2011 03:16:25 +0000 Subject: btrfs: cleanup error handling in btrfs_unlink_inode() When btrfs_alloc_path() fails, btrfs_free_path() need not be called. Therefore, it changes the branch ahead. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 36bc3f49ebf9..c9bc0afdbfc6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2646,7 +2646,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto err; + goto out; } path->leave_spinning = 1; -- cgit v1.2.1 From 8e4eef7a60eeca0fe7503e5cbd3b24ff4941c732 Mon Sep 17 00:00:00 2001 From: Alexey Charkov Date: Wed, 2 Feb 2011 21:15:35 +0000 Subject: btrfs: Drop __exit attribute on btrfs_exit_compress As this function is called in some error paths while not removing the module, the __exit attribute prevents the kernel image from linking when btrfs is compiled in statically. Signed-off-by: Alexey Charkov Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3a932f183da1..4d2110eafe29 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -921,7 +921,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __exit btrfs_exit_compress(void) +void btrfs_exit_compress(void) { free_workspaces(); } -- cgit v1.2.1 From d402539b8fc3fa21f16eb5e654be742670399e8a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Feb 2011 08:54:35 -0500 Subject: cifs: remove checks for ses->status == CifsExiting ses->status is never set to CifsExiting, so these checks are always false. Tested-by: JG Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 46c66ed01af4..904aa47e3515 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) } } - if (ses->status == CifsExiting) - return -EIO; - /* * Give demultiplex thread up to 10 seconds to reconnect, should be * greater than cifs socket timeout which is 7 seconds @@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) * retrying until process is killed or server comes * back on-line */ - if (!tcon->retry || ses->status == CifsExiting) { + if (!tcon->retry) { cFYI(1, "gave up waiting on reconnect in smb_init"); return -EHOSTDOWN; } -- cgit v1.2.1 From d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Mon, 7 Feb 2011 12:46:14 -0500 Subject: ext4: Fix data corruption with multi-block writepages support This fixes a corruption problem with the multi-block writepages submittal change for ext4, from commit bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc ("ext4: use bio layer instead of buffer layer in mpage_da_submit_io"). (Note that this corruption is not present in 2.6.37 on ext4, because the corruption was detected after the feature was merged in 2.6.37-rc1, and so it was turned off by adding a non-default mount option, mblk_io_submit. With this commit, which hopefully fixes the last of the bugs with this feature, we'll be able to turn on this performance feature by default in 2.6.38, and remove the mblk_io_submit option.) The ext4 code path to bundle multiple pages for writeback in ext4_bio_write_page() had a bug: we should be clearing buffer head dirty flags *before* we submit the bio, not in the completion routine. The patch below was tested on 2.6.37 under KVM with the postgresql script which was submitted by Jon Nelson as documented in commit 1449032be1. Without the patch, I'd hit the corruption problem about 50-70% of the time. With the patch, I executed the script > 100 times with no corruption seen. I also fixed a bug to make sure ext4_end_bio() doesn't dereference the bio after the bio_put() call. Reported-by: Jon Nelson Reported-by: Matthias Bayer Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/page-io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7270dcfca92a..4e9b0a242f4c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -190,6 +190,7 @@ static void ext4_end_bio(struct bio *bio, int error) struct inode *inode; unsigned long flags; int i; + sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); bio->bi_private = NULL; @@ -207,9 +208,7 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) SetPageError(page); BUG_ON(!head); - if (head->b_size == PAGE_CACHE_SIZE) - clear_buffer_dirty(head); - else { + if (head->b_size != PAGE_CACHE_SIZE) { loff_t offset; loff_t io_end_offset = io_end->offset + io_end->size; @@ -221,7 +220,6 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) buffer_io_error(bh); - clear_buffer_dirty(bh); } if (buffer_delay(bh)) partial_write = 1; @@ -257,7 +255,7 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) - bio->bi_sector >> (inode->i_blkbits - 9)); + bi_sector >> (inode->i_blkbits - 9)); } /* Add the io_end to per-inode completed io list*/ @@ -380,6 +378,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, blocksize = 1 << inode->i_blkbits; + BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); set_page_writeback(page); ClearPageError(page); @@ -397,12 +396,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; if (block_start >= len) { clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; } + clear_buffer_dirty(bh); ret = io_submit_add_bh(io, io_page, inode, wbc, bh); if (ret) { /* -- cgit v1.2.1 From 3a90983dbdcb2f4f48c0d771d8e5b4d88f27fae6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Jan 2011 13:34:40 +0800 Subject: Btrfs: Fix page count calculation take offset of start position into account when calculating page count. Signed-off-by: Yan, Zheng Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9e097fbfc78d..b0ff34b96607 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -991,8 +991,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, size_t write_bytes = min(iov_iter_count(&i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + size_t num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); @@ -1022,8 +1022,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) -- cgit v1.2.1 From 7e90d705fc9f8c5e3a1549281dce0654d049243b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 8 Feb 2011 23:52:32 +0000 Subject: [CIFS] Do not send SMBEcho requests on new sockets until SMBNegotiate In order to determine whether an SMBEcho request can be sent we need to know that the socket is established (server tcpStatus == CifsGood) AND that an SMB NegotiateProtocol has been sent (server maxBuf != 0). Without the second check we can send an Echo request during reconnection before the server can accept it. CC: JG Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index edd5b29b53c9..1ab33eb71d95 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -188,6 +188,8 @@ struct TCP_Server_Info { /* multiplexed reads or writes */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ + /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ + /* when socket is setup (and during reconnect) before NegProt sent */ unsigned int max_rw; /* maxRw specifies the maximum */ /* message size the server can send or receive for */ /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 257b6d895e20..10011e99b34d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -341,7 +341,7 @@ cifs_echo_request(struct work_struct *work) * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done. * Also, no need to ping if we got a response recently */ - if (server->tcpStatus != CifsGood || + if ((server->tcpStatus != CifsGood) || (server->maxBuf == 0) || time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) goto requeue_echo; -- cgit v1.2.1 From 195291e68c2ad59a046fc56d32bf59635b100e5c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 9 Feb 2011 12:01:42 -0500 Subject: cifs: clean up checks in cifs_echo_request Follow-on patch to 7e90d705 which is already in Steve's tree... The check for tcpStatus == CifsGood is not meaningful since it doesn't indicate whether the NEGOTIATE request has been done. Also, clarify why we're checking for maxBuf == 0. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 10011e99b34d..161f24ca4f6e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -338,10 +338,11 @@ cifs_echo_request(struct work_struct *work) struct TCP_Server_Info, echo.work); /* - * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done. - * Also, no need to ping if we got a response recently + * We cannot send an echo until the NEGOTIATE_PROTOCOL request is + * done, which is indicated by maxBuf != 0. Also, no need to ping if + * we got a response recently */ - if ((server->tcpStatus != CifsGood) || (server->maxBuf == 0) || + if (server->maxBuf == 0 || time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) goto requeue_echo; -- cgit v1.2.1 From 71823baff1978be892e7a36eddf6170e1cc6650d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 10 Feb 2011 08:03:50 -0500 Subject: cifs: don't always drop malformed replies on the floor (try #3) Slight revision to this patch...use min_t() instead of conditional assignment. Also, remove the FIXME comment and replace it with the explanation that Steve gave earlier. After receiving a packet, we currently check the header. If it's no good, then we toss it out and continue the loop, leaving the caller waiting on that response. In cases where the packet has length inconsistencies, but the MID is valid, this leads to unneeded delays. That's especially problematic now that the client waits indefinitely for responses. Instead, don't immediately discard the packet if checkSMB fails. Try to find a matching mid_q_entry, mark it as having a malformed response and issue the callback. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 +- fs/cifs/connect.c | 30 ++++++++++++++++++++++++------ fs/cifs/transport.c | 3 +++ 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1ab33eb71d95..17afb0fbcaed 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -654,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define MID_REQUEST_SUBMITTED 2 #define MID_RESPONSE_RECEIVED 4 #define MID_RETRY_NEEDED 8 /* session closed while this request out */ -#define MID_NO_RESP_NEEDED 0x10 +#define MID_RESPONSE_MALFORMED 0x10 /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 161f24ca4f6e..8d6c17ab593d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -586,11 +586,20 @@ incomplete_rcv: total_read += 4; /* account for rfc1002 hdr */ dump_smb(smb_buffer, total_read); - if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) { + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); + if (length != 0) cifs_dump_mem("Bad SMB: ", smb_buffer, - total_read < 48 ? total_read : 48); - continue; - } + min_t(unsigned int, total_read, 48)); mid_entry = NULL; server->lstrp = jiffies; @@ -602,7 +611,8 @@ incomplete_rcv: if ((mid_entry->mid == smb_buffer->Mid) && (mid_entry->midState == MID_REQUEST_SUBMITTED) && (mid_entry->command == smb_buffer->Command)) { - if (check2ndT2(smb_buffer,server->maxBuf) > 0) { + if (length == 0 && + check2ndT2(smb_buffer, server->maxBuf) > 0) { /* We have a multipart transact2 resp */ isMultiRsp = true; if (mid_entry->resp_buf) { @@ -637,7 +647,12 @@ incomplete_rcv: mid_entry->resp_buf = smb_buffer; mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: - mid_entry->midState = MID_RESPONSE_RECEIVED; + if (length == 0) + mid_entry->midState = + MID_RESPONSE_RECEIVED; + else + mid_entry->midState = + MID_RESPONSE_MALFORMED; #ifdef CONFIG_CIFS_STATS2 mid_entry->when_received = jiffies; #endif @@ -658,6 +673,9 @@ multi_t2_fnd: else smallbuf = NULL; } + } else if (length != 0) { + /* response sanity checks failed */ + continue; } else if (!is_valid_oplock_break(smb_buffer, server) && !isMultiRsp) { cERROR(1, "No task to wake, unknown frame received! " diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index fbc5aace54b1..46d8756f2b24 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -457,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) case MID_RETRY_NEEDED: rc = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + rc = -EIO; + break; default: cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, mid->mid, mid->midState); -- cgit v1.2.1 From 6b155c8fd4d239f7d883d455bbad1be47724bbfc Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 11 Feb 2011 16:44:31 -0600 Subject: dlm: use single thread workqueues The recent commit to use cmwq for send and recv threads dcce240ead802d42b1e45ad2fcb2ed4a399cb255 introduced problems, apparently due to multiple workqueue threads. Single threads make the problems go away, so return to that until we fully understand the concurrency issues with multiple threads. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9c64ae9e4c1a..2d8c87b951c2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1468,15 +1468,13 @@ static void work_stop(void) static int work_start(void) { - recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_FREEZEABLE, 0); + recv_workqueue = create_singlethread_workqueue("dlm_recv"); if (!recv_workqueue) { log_print("can't start dlm_recv"); return -ENOMEM; } - send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_FREEZEABLE, 0); + send_workqueue = create_singlethread_workqueue("dlm_send"); if (!send_workqueue) { log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); -- cgit v1.2.1 From 2dab597441667d6c04451a7dcf215241ad4c74f6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 11 Feb 2011 15:53:38 -0800 Subject: Fix possible filp_cachep memory corruption In commit 31e6b01f4183 ("fs: rcu-walk for path lookup") we started doing path lookup using RCU, which then falls back to a careful non-RCU lookup in case of problems (LOOKUP_REVAL). So do_filp_open() has this "re-do the lookup carefully" looping case. However, that means that we must not release the open-intent file data if we are going to loop around and use it once more! Fix this by moving the release of the open-intent data to the function that allocates it (do_filp_open() itself) rather than the helper functions that can get called multiple times (finish_open() and do_last()). This makes the logic for the lifetime of that field much more obvious, and avoids the possible double free. Reported-by: J. R. Okajima Acked-by: Al Viro Cc: Nick Piggin Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 20 ++++++++++---------- fs/open.c | 2 ++ 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7d77f24d32a9..ec4b2d0190a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -561,10 +561,14 @@ static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) */ void release_open_intent(struct nameidata *nd) { - if (nd->intent.open.file->f_path.dentry == NULL) - put_filp(nd->intent.open.file); - else - fput(nd->intent.open.file); + struct file *file = nd->intent.open.file; + + if (file && !IS_ERR(file)) { + if (file->f_path.dentry == NULL) + put_filp(file); + else + fput(file); + } } /* @@ -2265,8 +2269,6 @@ static struct file *finish_open(struct nameidata *nd, return filp; exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); path_put(&nd->path); return ERR_PTR(error); } @@ -2389,8 +2391,6 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); path_put(&nd->path); return ERR_PTR(error); } @@ -2477,6 +2477,7 @@ struct file *do_filp_open(int dfd, const char *pathname, } audit_inode(pathname, nd.path.dentry); filp = finish_open(&nd, open_flag, acc_mode); + release_open_intent(&nd); return filp; creat: @@ -2553,6 +2554,7 @@ out: path_put(&nd.root); if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) goto reval; + release_open_intent(&nd); return filp; exit_dput: @@ -2560,8 +2562,6 @@ exit_dput: out_path: path_put(&nd.path); out_filp: - if (!IS_ERR(nd.intent.open.file)) - release_open_intent(&nd); filp = ERR_PTR(error); goto out; } diff --git a/fs/open.c b/fs/open.c index e52389e1f05b..5a2c6ebc22b5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -790,6 +790,8 @@ struct file *nameidata_to_filp(struct nameidata *nd) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; + nd->intent.open.file = NULL; + /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) { path_get(&nd->path); -- cgit v1.2.1 From d863b50ab01333659314c2034890cb76d9fdc3c7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 10 Feb 2011 15:01:20 -0800 Subject: vfs: call rcu_barrier after ->kill_sb() In commit fa0d7e3de6d6 ("fs: icache RCU free inodes"), we use rcu free inode instead of freeing the inode directly. It causes a crash when we rmmod immediately after we umount the volume[1]. So we need to call rcu_barrier after we kill_sb so that the inode is freed before we do rmmod. The idea is inspired by Aneesh Kumar. rcu_barrier will wait for all callbacks to end before preceding. The original patch was done by Tao Ma, but synchronize_rcu() is not enough here. 1. http://marc.info/?l=linux-fsdevel&m=129680863330185&w=2 Tested-by: Tao Ma Signed-off-by: Boaz Harrosh Cc: Nick Piggin Cc: Al Viro Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 74e149efed81..7e9dd4cc2c01 100644 --- a/fs/super.c +++ b/fs/super.c @@ -177,6 +177,11 @@ void deactivate_locked_super(struct super_block *s) struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { fs->kill_sb(s); + /* + * We need to call rcu_barrier so all the delayed rcu free + * inodes are flushed before we release the fs module. + */ + rcu_barrier(); put_filesystem(fs); put_super(s); } else { -- cgit v1.2.1 From 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Feb 2011 08:12:18 -0500 Subject: ext4: make grpinfo slab cache names static In 2.6.37 I was running into oopses with repeated module loads & unloads. I tracked this down to: fb1813f4 ext4: use dedicated slab caches for group_info structures (this was in addition to the features advert unload problem) The kstrdup & subsequent kfree of the cache name was causing a double free. In slub, at least, if I read it right it allocates & frees the name itself, slab seems to do something different... so in slub I think we were leaking -our- cachep->name, and double freeing the one allocated by slub. After getting lost in slab/slub/slob a bit, I just looked at other sized-caches that get allocated. jbd2, biovec, sgpool all do it more or less the way jbd2 does. Below patch follows the jbd2 method of dynamically allocating a cache at mount time from a list of static names. (This might also possibly fix a race creating the caches with parallel mounts running). [Folded in a fix from Dan Carpenter which fixed an off-by-one error in the original patch] Cc: stable@kernel.org Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 100 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 851f49b2f9d2..d1fe09aea73d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ -#define NR_GRPINFO_CACHES \ - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1) +#define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, @@ -2414,6 +2419,55 @@ err_freesgi: return -ENOMEM; } +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + ext4_groupinfo_caches[cache_index] = cachep; + + return 0; +} + int ext4_mb_init(struct super_block *sb, int needs_recovery) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned offset; unsigned max; int ret; - int cache_index; - struct kmem_cache *cachep; - char *namep = NULL; i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); @@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) goto out; } - cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; - cachep = ext4_groupinfo_caches[cache_index]; - if (!cachep) { - char name[32]; - int len = offsetof(struct ext4_group_info, - bb_counters[sb->s_blocksize_bits + 2]); - - sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits); - namep = kstrdup(name, GFP_KERNEL); - if (!namep) { - ret = -ENOMEM; - goto out; - } - - /* Need to free the kmem_cache_name() when we - * destroy the slab */ - cachep = kmem_cache_create(namep, len, 0, - SLAB_RECLAIM_ACCOUNT, NULL); - if (!cachep) { - ret = -ENOMEM; - goto out; - } - ext4_groupinfo_caches[cache_index] = cachep; - } + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; @@ -2520,7 +2550,6 @@ out: if (ret) { kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - kfree(namep); } return ret; } @@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void) void ext4_exit_mballoc(void) { - int i; /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. @@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); - - for (i = 0; i < NR_GRPINFO_CACHES; i++) { - struct kmem_cache *cachep = ext4_groupinfo_caches[i]; - if (cachep) { - char *name = (char *)kmem_cache_name(cachep); - kmem_cache_destroy(cachep); - kfree(name); - } - } + ext4_groupinfo_destroy_slabs(); ext4_remove_debugfs_entry(); } -- cgit v1.2.1 From e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Feb 2011 08:17:34 -0500 Subject: ext4: serialize unaligned asynchronous DIO ext4 has a data corruption case when doing non-block-aligned asynchronous direct IO into a sparse file, as demonstrated by xfstest 240. The root cause is that while ext4 preallocates space in the hole, mappings of that space still look "new" and dio_zero_block() will zero out the unwritten portions. When more than one AIO thread is going, they both find this "new" block and race to zero out their portion; this is uncoordinated and causes data corruption. Dave Chinner fixed this for xfs by simply serializing all unaligned asynchronous direct IO. I've done the same here. The difference is that we only wait on conversions, not all IO. This is a very big hammer, and I'm not very pleased with stuffing this into ext4_file_write(). But since ext4 is DIO_LOCKING, we need to serialize it at this high level. I tried to move this into ext4_ext_direct_IO, but by then we have the i_mutex already, and we will wait on the work queue to do conversions - which must also take the i_mutex. So that won't work. This was originally exposed by qemu-kvm installing to a raw disk image with a normal sector-63 alignment. I've tested a backport of this patch with qemu, and it does avoid the corruption. It is also quite a lot slower (14 min for package installs, vs. 8 min for well-aligned) but I'll take slow correctness over fast corruption any day. Mingming suggested that we can track outstanding conversions, and wait on those so that non-sparse files won't be affected, and I've implemented that here; unaligned AIO to nonsparse files won't take a perf hit. [tytso@mit.edu: Keep the mutex as a hashed array instead of bloating the ext4 inode] [tytso@mit.edu: Fix up namespace issues so that global variables are protected with an "ext4_" prefix.] Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ++++++++++ fs/ext4/extents.c | 10 ++++++---- fs/ext4/file.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/page-io.c | 25 ++++++++++++----------- fs/ext4/super.c | 13 +++++++++++- 5 files changed, 100 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0c8d97b56f34..3aa0b72b3b94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -848,6 +848,7 @@ struct ext4_inode_info { atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 63a75810b7c3..ccce8a7e94ed 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to convertion to written when IO is * completed */ - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * that we need to perform convertion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2e8322c8aa88..7b80d543b89e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } +static void ext4_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + size_t count = iov_length(iov, nr_segs); + loff_t final_size = pos + count; + + if (pos >= inode->i_size) + return 0; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + + return 0; +} + static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int unaligned_aio = 0; + int ret; /* * If we have encountered a bitmap-format file, the size limit @@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, nr_segs = iov_shorten((struct iovec *)iov, nr_segs, sbi->s_bitmap_maxbytes - pos); } + } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && + !is_sync_kiocb(iocb))) { + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); } - return generic_file_aio_write(iocb, iov, nr_segs, pos); + /* Unaligned direct AIO must be serialized; see comment above */ + if (unaligned_aio) { + static unsigned long unaligned_warn_time; + + /* Warn about this once per day */ + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) + ext4_msg(inode->i_sb, KERN_WARNING, + "Unaligned AIO/DIO on inode %ld by %s; " + "performance will be poor.", + inode->i_ino, current->comm); + mutex_lock(ext4_aio_mutex(inode)); + ext4_aiodio_wait(inode); + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (unaligned_aio) + mutex_unlock(ext4_aio_mutex(inode)); + + return ret; } static const struct vm_operations_struct ext4_file_vm_ops = { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4e9b0a242f4c..955cc309142f 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,14 +32,8 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; -#define WQ_HASH_SZ 37 -#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) -static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; - int __init ext4_init_pageio(void) { - int i; - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -48,9 +42,6 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } - for (i = 0; i < WQ_HASH_SZ; i++) - init_waitqueue_head(&ioend_wq[i]); - return 0; } @@ -62,7 +53,7 @@ void ext4_exit_pageio(void) void ext4_ioend_wait(struct inode *inode) { - wait_queue_head_t *wq = to_ioend_wq(inode); + wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); } @@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = to_ioend_wq(io->inode); + wq = ext4_ioend_wq(io->inode); if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && waitqueue_active(wq)) wake_up_all(wq); @@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io) struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," @@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->iocb) aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ - io->flag &= ~EXT4_IO_END_UNWRITTEN; + if (io->flag & EXT4_IO_END_UNWRITTEN) { + io->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + wq = ext4_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && + waitqueue_active(wq)) { + wake_up_all(wq); + } + } + return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 86b05486dc63..f6a318f836b2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); + atomic_set(&ei->i_aiodio_unwritten, 0); return &ei->vfs_inode; } @@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void) kfree(ext4_feat); } +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + static int __init ext4_init_fs(void) { - int err; + int i, err; ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + err = ext4_init_pageio(); if (err) return err; -- cgit v1.2.1 From e44718318004a5618d1dfe2d080e2862532d8e5f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Feb 2011 08:18:24 -0500 Subject: jbd2: call __jbd2_log_start_commit with j_state_lock write locked On an SMP ARM system running ext4, I've received a report that the first J_ASSERT in jbd2_journal_commit_transaction has been triggering: J_ASSERT(journal->j_running_transaction != NULL); While investigating possible causes for this problem, I noticed that __jbd2_log_start_commit() is getting called with j_state_lock only read-locked, in spite of the fact that it's possible for it might j_commit_request. Fix this by grabbing the necessary information so we can test to see if we need to start a new transaction before dropping the read lock, and then calling jbd2_log_start_commit() which will grab the write lock. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 9 +++++++-- fs/jbd2/transaction.c | 21 ++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9e4686900f18..97e73469b2c4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction commit was started. + * Called with j_state_lock locked for writing. + * Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal) { transaction_t *transaction = NULL; tid_t tid; + int need_to_start = 0; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; - __jbd2_log_start_commit(journal, transaction->t_tid); + if (!tid_geq(journal->j_commit_request, transaction->t_tid)) + need_to_start = 1; } else if (journal->j_committing_transaction) transaction = journal->j_committing_transaction; @@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) tid = transaction->t_tid; read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); jbd2_log_wait_commit(journal, tid); return 1; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index faad2bd787c7..1d1191050f99 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction) static int start_this_handle(journal_t *journal, handle_t *handle, int gfp_mask) { - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; - transaction_t *new_transaction = NULL; + transaction_t *transaction, *new_transaction = NULL; + tid_t tid; + int needed, need_to_start; + int nblocks = handle->h_buffer_credits; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -222,8 +222,11 @@ repeat: atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - __jbd2_log_start_commit(journal, transaction->t_tid); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int ret; + tid_t tid; + int need_to_start, ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ @@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "restarting handle %p\n", handle); - __jbd2_log_start_commit(journal, transaction->t_tid); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; -- cgit v1.2.1 From 541ce98c10111dae7604543dda6c6f7e7a6015d8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 14 Jan 2011 20:00:02 -0500 Subject: nfsd: don't leak dentry count on mnt_want_write failure The exit cleanup isn't quite right here. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 641117f2188d..fda3be237773 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1812,22 +1812,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); if (host_err) - goto out_nfserr; + goto out_put; host_err = nfsd_break_lease(rdentry->d_inode); if (host_err) - goto out_put; + goto out_drop_write; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); else host_err = vfs_rmdir(dirp, rdentry); -out_put: - dput(rdentry); - if (!host_err) host_err = commit_metadata(fhp); - +out_drop_write: mnt_drop_write(fhp->fh_export->ex_path.mnt); +out_put: + dput(rdentry); + out_nfserr: err = nfserrno(host_err); out: -- cgit v1.2.1 From 0af3f814ccf0a13d3e01e8115b96f1824379fc72 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 13 Jan 2011 11:25:31 +0200 Subject: NFSD: use nfserr for status after decode_cb_op_status Bugs introduced in 85a56480191ca9f08fc775c129b9eb5c8c1f2c05 "NFSD: Update XDR decoders in NFSv4 callback client" Cc: Chuck Lever Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3be975e18919..cde36cb0f348 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, out: return status; out_default: - return nfs_cb_stat_to_errno(status); + return nfs_cb_stat_to_errno(nfserr); } /* @@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) goto out; if (unlikely(nfserr != NFS4_OK)) - goto out_default; + status = nfs_cb_stat_to_errno(nfserr); out: return status; -out_default: - return nfs_cb_stat_to_errno(status); } /* -- cgit v1.2.1 From 3aa6e0aa8ab3e64bbfba092c64d42fd1d006b124 Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Tue, 1 Feb 2011 17:16:29 +0300 Subject: NFSD: memory corruption due to writing beyond the stat array If nfsd fails to find an exported via NFS file in the readahead cache, it should increment corresponding nfsdstats counter (ra_depth[10]), but due to a bug it may instead write to ra_depth[11], corrupting the following field. In a kernel with NFSDv4 compiled in the corruption takes the form of an increment of a counter of the number of NFSv4 operation 0's received; since there is no operation 0, this is harmless. In a kernel with NFSDv4 disabled it corrupts whatever happens to be in the memory beyond nfsdstats. Signed-off-by: Konstantin Khorenko Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fda3be237773..30c73f8a5791 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino) if (ra->p_count == 0) frap = rap; } - depth = nfsdstats.ra_size*11/10; + depth = nfsdstats.ra_size; if (!frap) { spin_unlock(&rab->pb_lock); return NULL; -- cgit v1.2.1 From 6b57d9c86d0ab11c091b6db2edff8b5553fd445b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 31 Jan 2011 11:54:04 -0500 Subject: nfsd4: split up nfsd_break_deleg_cb We'll be adding some more code here soon. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d98d0213285d..ceb66170fda3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2329,23 +2329,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) nfs4_file_put_access(fp, O_RDONLY); } -/* - * Spawn a thread to perform a recall on the delegation represented - * by the lease (file_lock) - * - * Called from break_lease() with lock_flocks() held. - * Note: we assume break_lease will only call this *once* for any given - * lease. - */ -static -void nfsd_break_deleg_cb(struct file_lock *fl) +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; - - dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl); - if (!dp) - return; - /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2360,15 +2345,28 @@ void nfsd_break_deleg_cb(struct file_lock *fl) /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); + nfsd4_cb_recall(dp); +} + +/* + * Called from break_lease() with lock_flocks() held. + * Note: we assume break_lease will only call this *once* for any given + * lease. + */ +static void nfsd_break_deleg_cb(struct file_lock *fl) +{ + struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + + BUG_ON(!dp); /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if the delegation isn't returned - * in time. + * in time: */ fl->fl_break_time = 0; + nfsd_break_one_deleg(dp); dp->dl_file->fi_had_conflict = true; - nfsd4_cb_recall(dp); } static -- cgit v1.2.1 From 22d38c4c10e8344aa406897d99a35d585d2cb77d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 31 Jan 2011 11:55:12 -0500 Subject: nfsd4: add helper function for lease setup Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ceb66170fda3..65978a9aa877 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2639,6 +2639,26 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + return NULL; + locks_init_lock(fl); + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_LEASE; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)dp; + fl->fl_file = dp->dl_vfs_file; + BUG_ON(!fl->fl_file); + fl->fl_pid = current->tgid; + dp->dl_flock = fl; + return fl; +} + /* * Attempt to hand out a delegation. */ @@ -2684,20 +2704,9 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta goto out; } status = -ENOMEM; - fl = locks_alloc_lock(); + fl = nfs4_alloc_init_lease(dp, flag); if (!fl) goto out; - locks_init_lock(fl); - fl->fl_lmops = &nfsd_lease_mng_ops; - fl->fl_flags = FL_LEASE; - fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)dp; - fl->fl_file = find_readable_file(stp->st_file); - BUG_ON(!fl->fl_file); - fl->fl_pid = current->tgid; - dp->dl_flock = fl; - /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callback fl_change is used */ -- cgit v1.2.1 From dd239cc05f0ad9f582dd83d88a4fb5edcc57a026 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 31 Jan 2011 17:14:55 -0500 Subject: nfsd4: fix leak on allocation error Also share some common exit code. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 65978a9aa877..099d6fa64f7f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2699,14 +2699,12 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta } dp = alloc_init_deleg(sop->so_client, stp, fh, flag); - if (dp == NULL) { - flag = NFS4_OPEN_DELEGATE_NONE; - goto out; - } + if (dp == NULL) + goto out_no_deleg; status = -ENOMEM; fl = nfs4_alloc_init_lease(dp, flag); if (!fl) - goto out; + goto out_free; /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callback fl_change is used */ @@ -2714,9 +2712,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dprintk("NFSD: setlease failed [%d], no delegation\n", status); dp->dl_flock = NULL; locks_free_lock(fl); - unhash_delegation(dp); - flag = NFS4_OPEN_DELEGATE_NONE; - goto out; + goto out_free; } memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); @@ -2729,6 +2725,12 @@ out: && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_delegate_type = flag; + return; +out_free: + unhash_delegation(dp); +out_no_deleg: + flag = NFS4_OPEN_DELEGATE_NONE; + goto out; } /* -- cgit v1.2.1 From edab9782b5a16abb8d139d261e81e13ef0be35a9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 31 Jan 2011 17:58:10 -0500 Subject: nfsd4: split lease setting into separate function Splitting some code into a separate function which we'll be adding some more to. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 099d6fa64f7f..dbb2141cf88f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2659,6 +2659,23 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f return fl; } +static int nfs4_setlease(struct nfs4_delegation *dp, int flag) +{ + struct file_lock *fl; + int status; + + fl = nfs4_alloc_init_lease(dp, flag); + if (!fl) + return -ENOMEM; + status = vfs_setlease(dp->dl_vfs_file, fl->fl_type, &fl); + if (status) { + dp->dl_flock = NULL; + locks_free_lock(fl); + return -ENOMEM; + } + return 0; +} + /* * Attempt to hand out a delegation. */ @@ -2668,7 +2685,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; int cb_up; - struct file_lock *fl; int status, flag = 0; cb_up = nfsd4_cb_channel_good(sop->so_client); @@ -2701,19 +2717,9 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp = alloc_init_deleg(sop->so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; - status = -ENOMEM; - fl = nfs4_alloc_init_lease(dp, flag); - if (!fl) - goto out_free; - /* vfs_setlease checks to see if delegation should be handed out. - * the lock_manager callback fl_change is used - */ - if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { - dprintk("NFSD: setlease failed [%d], no delegation\n", status); - dp->dl_flock = NULL; - locks_free_lock(fl); + status = nfs4_setlease(dp, flag); + if (status) goto out_free; - } memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); -- cgit v1.2.1 From 65bc58f5187e2ff4011ef1bd3082e83cd1b036f1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Feb 2011 15:44:12 -0500 Subject: nfsd4: remove unused deleg dprintk's. These aren't all that useful, and get in the way of the next steps. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dbb2141cf88f..d978192838a3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -958,8 +958,6 @@ expire_client(struct nfs4_client *clp) spin_lock(&recall_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - dprintk("NFSD: expire client. dp %p, fp %p\n", dp, - dp->dl_flock); list_del_init(&dp->dl_perclnt); list_move(&dp->dl_recall_lru, &reaplist); } @@ -2931,8 +2929,6 @@ nfs4_laundromat(void) test_val = u; break; } - dprintk("NFSD: purging unused delegation dp %p, fp %p\n", - dp, dp->dl_flock); list_move(&dp->dl_recall_lru, &reaplist); } spin_unlock(&recall_lock); -- cgit v1.2.1 From 5d926e8c2f46dc09f4ddde86644a5f1d0726a470 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Feb 2011 16:53:46 -0500 Subject: nfsd4: modify fi_delegations under recall_lock Modify fi_delegations only under the recall_lock, allowing us to use that list on lease breaks. Also some trivial cleanup to simplify later changes. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d978192838a3..8b6cd3cf4835 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -277,9 +277,9 @@ nfs4_close_delegation(struct nfs4_delegation *dp) static void unhash_delegation(struct nfs4_delegation *dp) { - list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); + list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); spin_unlock(&recall_lock); nfs4_close_delegation(dp); @@ -2336,9 +2336,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * it's safe to take a reference: */ atomic_inc(&dp->dl_count); - spin_lock(&recall_lock); list_add_tail(&dp->dl_recall_lru, &del_recall_lru); - spin_unlock(&recall_lock); /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); @@ -2363,8 +2361,10 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - nfsd_break_one_deleg(dp); + spin_lock(&recall_lock); dp->dl_file->fi_had_conflict = true; + nfsd_break_one_deleg(dp); + spin_unlock(&recall_lock); } static -- cgit v1.2.1 From acfdf5c383b38f7f4dddae41b97c97f1ae058f49 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 31 Jan 2011 19:20:39 -0500 Subject: nfsd4: acquire only one lease per file Instead of acquiring one lease each time another client opens a file, nfsd can acquire just one lease to represent all of them, and reference count it to determine when to release it. This fixes a regression introduced by c45821d263a8a5109d69a9e8942b8d65bcd5f31a "locks: eliminate fl_mylease callback": after that patch, only the struct file * is used to determine who owns a given lease. But since we recently converted the server to share a single struct file per open, if we acquire multiple leases on the same file from nfsd, it then becomes impossible on unlocking a lease to determine which of those leases (all of whom share the same struct file *) we meant to remove. Thanks to Takashi Iwai for catching a bug in a previous version of this patch. Tested-by: Takashi Iwai Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 95 +++++++++++++++++++++++++++++++---------------------- fs/nfsd/state.h | 5 +-- 2 files changed, 58 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8b6cd3cf4835..54b60bfceb8d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; - dp->dl_vfs_file = find_readable_file(fp); - get_file(dp->dl_vfs_file); - dp->dl_flock = NULL; dp->dl_type = type; dp->dl_stateid.si_boot = boot_time; dp->dl_stateid.si_stateownerid = current_delegid++; @@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); - list_add(&dp->dl_perfile, &fp->fi_delegations); - list_add(&dp->dl_perclnt, &clp->cl_delegations); INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); return dp; } @@ -253,24 +248,18 @@ nfs4_put_delegation(struct nfs4_delegation *dp) if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); - fput(dp->dl_vfs_file); kmem_cache_free(deleg_slab, dp); num_delegations--; } } -/* Remove the associated file_lock first, then remove the delegation. - * lease_modify() is called to remove the FS_LEASE file_lock from - * the i_flock list, eventually calling nfsd's lock_manager - * fl_release_callback. - */ -static void -nfs4_close_delegation(struct nfs4_delegation *dp) +static void nfs4_put_deleg_lease(struct nfs4_file *fp) { - dprintk("NFSD: close_delegation dp %p\n",dp); - /* XXX: do we even need this check?: */ - if (dp->dl_flock) - vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); + if (atomic_dec_and_test(&fp->fi_delegees)) { + vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); + fp->fi_lease = NULL; + fp->fi_deleg_file = NULL; + } } /* Called under the state lock. */ @@ -282,7 +271,7 @@ unhash_delegation(struct nfs4_delegation *dp) list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); spin_unlock(&recall_lock); - nfs4_close_delegation(dp); + nfs4_put_deleg_lease(dp->dl_file); nfs4_put_delegation(dp); } @@ -2076,6 +2065,7 @@ alloc_init_file(struct inode *ino) fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; + fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); spin_lock(&recall_lock); @@ -2344,26 +2334,26 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) nfsd4_cb_recall(dp); } -/* - * Called from break_lease() with lock_flocks() held. - * Note: we assume break_lease will only call this *once* for any given - * lease. - */ +/* Called from break_lease() with lock_flocks() held. */ static void nfsd_break_deleg_cb(struct file_lock *fl) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; + struct nfs4_delegation *dp; - BUG_ON(!dp); + BUG_ON(!fp); + /* We assume break_lease is only called once per lease: */ + BUG_ON(fp->fi_had_conflict); /* * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if the delegation isn't returned + * we'll remove it ourself if a delegation isn't returned * in time: */ fl->fl_break_time = 0; spin_lock(&recall_lock); - dp->dl_file->fi_had_conflict = true; - nfsd_break_one_deleg(dp); + fp->fi_had_conflict = true; + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); spin_unlock(&recall_lock); } @@ -2455,13 +2445,15 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) static struct nfs4_delegation * find_delegation_file(struct nfs4_file *fp, stateid_t *stid) { - struct nfs4_delegation *dp; + struct nfs4_delegation *dp = NULL; + spin_lock(&recall_lock); list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) - return dp; + break; } - return NULL; + spin_unlock(&recall_lock); + return dp; } int share_access_to_flags(u32 share_access) @@ -2649,28 +2641,51 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f fl->fl_flags = FL_LEASE; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)dp; - fl->fl_file = dp->dl_vfs_file; - BUG_ON(!fl->fl_file); + fl->fl_owner = (fl_owner_t)(dp->dl_file); fl->fl_pid = current->tgid; - dp->dl_flock = fl; return fl; } static int nfs4_setlease(struct nfs4_delegation *dp, int flag) { + struct nfs4_file *fp = dp->dl_file; struct file_lock *fl; int status; fl = nfs4_alloc_init_lease(dp, flag); if (!fl) return -ENOMEM; - status = vfs_setlease(dp->dl_vfs_file, fl->fl_type, &fl); + fl->fl_file = find_readable_file(fp); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) { - dp->dl_flock = NULL; + list_del_init(&dp->dl_perclnt); locks_free_lock(fl); return -ENOMEM; } + fp->fi_lease = fl; + fp->fi_deleg_file = fl->fl_file; + get_file(fp->fi_deleg_file); + atomic_set(&fp->fi_delegees, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + return 0; +} + +static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + + if (!fp->fi_lease) + return nfs4_setlease(dp, flag); + spin_lock(&recall_lock); + if (fp->fi_had_conflict) { + spin_unlock(&recall_lock); + return -EAGAIN; + } + atomic_inc(&fp->fi_delegees); + list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_unlock(&recall_lock); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); return 0; } @@ -2715,7 +2730,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp = alloc_init_deleg(sop->so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; - status = nfs4_setlease(dp, flag); + status = nfs4_set_delegation(dp, flag); if (status) goto out_free; @@ -2731,7 +2746,7 @@ out: open->op_delegate_type = flag; return; out_free: - unhash_delegation(dp); + nfs4_put_delegation(dp); out_no_deleg: flag = NFS4_OPEN_DELEGATE_NONE; goto out; @@ -3139,7 +3154,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, goto out; renew_client(dp->dl_client); if (filpp) { - *filpp = find_readable_file(dp->dl_file); + *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } } else { /* open or lock stateid */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3074656ba7bf..2d31224b07bf 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -83,8 +83,6 @@ struct nfs4_delegation { atomic_t dl_count; /* ref count */ struct nfs4_client *dl_client; struct nfs4_file *dl_file; - struct file *dl_vfs_file; - struct file_lock *dl_flock; u32 dl_type; time_t dl_time; /* For recall: */ @@ -379,6 +377,9 @@ struct nfs4_file { */ atomic_t fi_readers; atomic_t fi_writers; + struct file *fi_deleg_file; + struct file_lock *fi_lease; + atomic_t fi_delegees; struct inode *fi_inode; u32 fi_id; /* used with stateowner->so_id * for stateid_hashtbl hash */ -- cgit v1.2.1 From 83f6b0c18204f68961f58b9f69e5dba0d36056a2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 6 Feb 2011 16:46:30 -0500 Subject: nfsd: break lease on unlink due to rename 4795bb37effb7b8fe77e2d2034545d062d3788a8 "nfsd: break lease on unlink, link, and rename", only broke the lease on the file that was being renamed, and didn't handle the case where the target path refers to an already-existing file that will be unlinked by a rename--in that case the target file should have any leases broken as well. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 30c73f8a5791..da1d9701f8e4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1742,6 +1742,13 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_dput_new; host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; + if (ndentry->d_inode) { + host_err = nfsd_break_lease(ndentry->d_inode); + if (host_err) + goto out_drop_write; + } if (host_err) goto out_drop_write; host_err = vfs_rename(fdir, odentry, tdir, ndentry); -- cgit v1.2.1 From eb14ab8ed24a0405fd056068b28c33a1cd846024 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 10 Feb 2011 12:35:00 -0500 Subject: Btrfs: fix page->private races There is a race where btrfs_releasepage can drop the page->private contents just as alloc_extent_buffer is setting up pages for metadata. Because of how the Btrfs page flags work, this results in us skipping the crc on the page during IO. This patch sovles the race by waiting until after the extent buffer is inserted into the radix tree before it sets page private. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++++-- fs/btrfs/extent_io.c | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b36eeef19194..3e1ea3e0477e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -359,10 +359,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); goto out; - if (!page->private) + } + if (!page->private) { + WARN_ON(1); goto out; + } len = page->private >> 2; WARN_ON(len == 0); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8862dda46ff6..0418bf2c9757 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1946,6 +1946,7 @@ void set_page_extent_mapped(struct page *page) static void set_page_extent_head(struct page *page, unsigned long len) { + WARN_ON(!PagePrivate(page)); set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } @@ -3195,7 +3196,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, } if (!PageUptodate(p)) uptodate = 0; - unlock_page(p); + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + if (i != 0) + unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -3219,9 +3226,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. + * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); return eb; free_eb: + if (eb->first_page && !page0) + unlock_page(eb->first_page); + if (!atomic_dec_and_test(&eb->refs)) return exists; btrfs_release_extent_buffer(eb); @@ -3272,10 +3296,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, continue; lock_page(page); + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); if (i == 0) set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); @@ -3465,6 +3490,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + if (inc_all_pages) page_cache_get(page); if (!PageUptodate(page)) { -- cgit v1.2.1 From e3f24cc521cb7ba60ac137abd1939e4e03435e80 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 14 Feb 2011 12:52:08 -0500 Subject: Btrfs: don't release pages when we can't clear the uptodate bits Btrfs tracks uptodate state in an rbtree as well as in the page bits. This is supposed to enable us to use block sizes other than the page size, but there are a few parts still missing before that completely works. But, our readpage routine trusts this additional range based tracking of uptodateness, much in the same way the buffer head up to date bits are trusted for the other filesystems. The problem is that sometimes we need to allocate memory in order to split records in the rbtree, even when we are just clearing bits. This can be difficult when our clearing function is called GFP_ATOMIC, which can happen in the releasepage path. So, what happens today looks like this: releasepage called with GFP_ATOMIC btrfs_releasepage calls clear_extent_bit clear_extent_bit fails to allocate ram, leaving the up to date bit set btrfs_releasepage returns success The end result is the page being gone, but btrfs thinking the range is up to date. Later on if someone tries to read that same page, the btrfs readpage code will return immediately thinking the page is already up to date. This commit fixes things to fail the releasepage when we can't clear the extent state bits. It covers both data pages and metadata tree blocks. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0418bf2c9757..e7aeba242701 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2822,9 +2822,17 @@ int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - clear_extent_bit(tree, start, end, + ret = clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; } return ret; } -- cgit v1.2.1 From 6848ad6461e551849ba3c32d945d4f45e96453a6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 14 Feb 2011 16:00:03 -0500 Subject: Btrfs: Fix balance panic Mark the cloned backref_node as checked in clone_backref_node() Signed-off-by: Yan, Zheng Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1f5556acb530..0825e4ed9447 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->bytenr = dest->node->start; new_node->level = node->level; new_node->lowest = node->lowest; + new_node->checked = 1; new_node->root = dest; if (!node->lowest) { -- cgit v1.2.1 From 51788b1bdd0d68345bab0af4301e7fa429277228 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Mon, 14 Feb 2011 16:04:23 -0500 Subject: btrfs: prevent heap corruption in btrfs_ioctl_space_info() Commit bf5fc093c5b625e4259203f1cee7ca73488a5620 refactored btrfs_ioctl_space_info() and introduced several security issues. space_args.space_slots is an unsigned 64-bit type controlled by a possibly unprivileged caller. The comparison as a signed int type allows providing values that are treated as negative and cause the subsequent allocation size calculation to wrap, or be truncated to 0. By providing a size that's truncated to 0, kmalloc() will return ZERO_SIZE_PTR. It's also possible to provide a value smaller than the slot count. The subsequent loop ignores the allocation size when copying data in, resulting in a heap overflow or write to ZERO_SIZE_PTR. The fix changes the slot count type and comparison typecast to u64, which prevents truncation or signedness errors, and also ensures that we don't copy more data than we've allocated in the subsequent loop. Note that zero-size allocations are no longer possible since there is already an explicit check for space_args.space_slots being 0 and truncation of this value is no longer an issue. Signed-off-by: Dan Rosenberg Signed-off-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02d224e8c83f..be2d4f6aaa5e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2208,7 +2208,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) int num_types = 4; int alloc_size; int ret = 0; - int slot_count = 0; + u64 slot_count = 0; int i, c; if (copy_from_user(&space_args, @@ -2247,7 +2247,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) goto out; } - slot_count = min_t(int, space_args.space_slots, slot_count); + slot_count = min_t(u64, space_args.space_slots, slot_count); alloc_size = sizeof(*dest) * slot_count; @@ -2267,6 +2267,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; + if (!slot_count) + break; + info = NULL; rcu_read_lock(); list_for_each_entry_rcu(tmp, &root->fs_info->space_info, @@ -2288,7 +2291,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; + slot_count--; } + if (!slot_count) + break; } up_read(&info->groups_sem); } -- cgit v1.2.1 From 67100f255dba284bcbb5ce795355dad1cff35658 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 6 Feb 2011 19:58:21 +0000 Subject: Btrfs - Fix memory leak in btrfs_init_new_device() Memory allocated by calling kstrdup() should be freed. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7cad59353b09..dadaaa8005c8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1603,12 +1603,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = find_next_devid(root, &device->devid); if (ret) { + kfree(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { + kfree(device->name); kfree(device); ret = PTR_ERR(trans); goto error; -- cgit v1.2.1 From c26a920373a983b52223eed5a13b97404d8b4158 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 14 Feb 2011 00:45:29 +0000 Subject: Btrfs: check return value of alloc_extent_map() I add the check on the return value of alloc_extent_map() to several places. In addition, alloc_extent_map() returns only the address or NULL. Therefore, check by IS_ERR() is unnecessary. So, I remove IS_ERR() checking. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_map.c | 4 ++-- fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 3 +++ 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 565e22d77b1b..a7aaa10c5302 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6584,7 +6584,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, u64 end = start + extent_key->offset - 1; em = alloc_extent_map(GFP_NOFS); - BUG_ON(!em || IS_ERR(em)); + BUG_ON(!em); em->start = start; em->len = extent_key->offset; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b0e1fce12530..2b6c12e983b3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; em = kmem_cache_alloc(extent_map_cache, mask); - if (!em || IS_ERR(em)) - return em; + if (!em) + return NULL; em->in_tree = 0; em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b0ff34b96607..65338a1d14ad 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -185,6 +185,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(GFP_NOFS); if (!split2) split2 = alloc_extent_map(GFP_NOFS); + BUG_ON(!split || !split2); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c9bc0afdbfc6..8d392ed73d57 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -644,6 +644,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -820,6 +821,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(ret); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -1169,6 +1171,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; -- cgit v1.2.1 From 844a391799c25d9ba85cbce33e4697db06083ec6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 00:38:26 -0500 Subject: nothing in do_follow_link() is going to see RCU Signed-off-by: Al Viro --- fs/namei.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index ec4b2d0190a8..9ce6d272f4f2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -668,9 +668,6 @@ force_reval_path(struct path *path, struct nameidata *nd) return 0; if (!status) { - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_drop_rcu(nd)) - return -ECHILD; d_invalidate(dentry); status = -ESTALE; } @@ -777,6 +774,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) int error; struct dentry *dentry = link->dentry; + BUG_ON(nd->flags & LOOKUP_RCU); + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); @@ -811,6 +810,11 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) { void *cookie; int err = -ELOOP; + + /* We drop rcu-walk here */ + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; + if (current->link_count >= MAX_NESTED_LINKS) goto loop; if (current->total_link_count >= 40) @@ -1419,9 +1423,6 @@ exec_again: goto out_dput; if (inode->i_op->follow_link) { - /* We commonly drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) - return -ECHILD; BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) @@ -1467,8 +1468,6 @@ last_component: break; if (inode && unlikely(inode->i_op->follow_link) && (lookup_flags & LOOKUP_FOLLOW)) { - if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) - return -ECHILD; BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) -- cgit v1.2.1 From 24643087e748bf192f1182766716e522dc1c972f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 01:26:22 -0500 Subject: in do_lookup() split RCU and non-RCU cases of need_revalidate and use unlikely() instead of gotos, for fsck sake... Signed-off-by: Al Viro --- fs/namei.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9ce6d272f4f2..7609bacc7046 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1259,9 +1259,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; -done2: + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + dentry = do_revalidate(dentry, nd); + if (!dentry) + goto need_lookup; + if (IS_ERR(dentry)) + goto fail; + if (!(nd->flags & LOOKUP_RCU)) + goto done; + } path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, false))) @@ -1274,8 +1280,13 @@ done2: if (!dentry) goto need_lookup; found: - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + dentry = do_revalidate(dentry, nd); + if (!dentry) + goto need_lookup; + if (IS_ERR(dentry)) + goto fail; + } done: path->mnt = mnt; path->dentry = dentry; @@ -1317,16 +1328,6 @@ need_lookup: mutex_unlock(&dir->i_mutex); goto found; -need_revalidate: - dentry = do_revalidate(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; - if (nd->flags & LOOKUP_RCU) - goto done2; - goto done; - fail: return PTR_ERR(dentry); } -- cgit v1.2.1 From f5e1c1c1afc1d979e2ac6a24cc99ba7143639f4d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 01:32:55 -0500 Subject: split do_revalidate() into RCU and non-RCU cases fixing oopsen in lookup_one_len() Signed-off-by: Al Viro --- fs/namei.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7609bacc7046..a98f7f141780 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -592,12 +592,10 @@ static int d_revalidate(struct dentry *dentry, struct nameidata *nd) return status; } -static inline struct dentry * +static struct dentry * do_revalidate(struct dentry *dentry, struct nameidata *nd) { - int status; - - status = d_revalidate(dentry, nd); + int status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { /* * The dentry failed validation. @@ -606,24 +604,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) * to return a fail status. */ if (status < 0) { - /* If we're in rcu-walk, we don't have a ref */ - if (!(nd->flags & LOOKUP_RCU)) - dput(dentry); + dput(dentry); dentry = ERR_PTR(status); - - } else { - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_dentry_drop_rcu_maybe(nd, dentry)) - return ERR_PTR(-ECHILD); - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; } } return dentry; } +static inline struct dentry * +do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) +{ + int status = dentry->d_op->d_revalidate(dentry, nd); + if (likely(status > 0)) + return dentry; + if (status == -ECHILD) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return ERR_PTR(-ECHILD); + return do_revalidate(dentry, nd); + } + if (status < 0) + return ERR_PTR(status); + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_dentry_drop_rcu(nd, dentry)) + return ERR_PTR(-ECHILD); + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + static inline int need_reval_dot(struct dentry *dentry) { if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) @@ -1260,7 +1273,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, nd->seq = seq; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate(dentry, nd); + dentry = do_revalidate_rcu(dentry, nd); if (!dentry) goto need_lookup; if (IS_ERR(dentry)) -- cgit v1.2.1 From f60aef7ec625236a6366722bb1be7b37596bf0ae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 01:35:28 -0500 Subject: drop out of RCU in return_reval ... thus killing the need to handle drop-from-RCU in d_revalidate() Signed-off-by: Al Viro --- fs/namei.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index a98f7f141780..10635d329175 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -571,25 +571,9 @@ void release_open_intent(struct nameidata *nd) } } -/* - * Call d_revalidate and handle filesystems that request rcu-walk - * to be dropped. This may be called and return in rcu-walk mode, - * regardless of success or error. If -ECHILD is returned, the caller - * must return -ECHILD back up the path walk stack so path walk may - * be restarted in ref-walk mode. - */ -static int d_revalidate(struct dentry *dentry, struct nameidata *nd) +static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd) { - int status; - - status = dentry->d_op->d_revalidate(dentry, nd); - if (status == -ECHILD) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return status; - status = dentry->d_op->d_revalidate(dentry, nd); - } - - return status; + return dentry->d_op->d_revalidate(dentry, nd); } static struct dentry * @@ -617,7 +601,7 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) static inline struct dentry * do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) { - int status = dentry->d_op->d_revalidate(dentry, nd); + int status = d_revalidate(dentry, nd); if (likely(status > 0)) return dentry; if (status == -ECHILD) { @@ -1517,12 +1501,15 @@ return_reval: * We may need to check the cached dentry for staleness. */ if (need_reval_dot(nd->path.dentry)) { + if (nameidata_drop_rcu_last_maybe(nd)) + return -ECHILD; /* Note: we do not d_invalidate() */ err = d_revalidate(nd->path.dentry, nd); if (!err) err = -ESTALE; if (err < 0) break; + return 0; } return_base: if (nameidata_drop_rcu_last_maybe(nd)) -- cgit v1.2.1 From 4e924a4f53a0e1ea060bd50695a12a238b250322 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 01:42:59 -0500 Subject: get rid of nameidata_dentry_drop_rcu() calling nameidata_drop_rcu() can't happen anymore and didn't work right anyway Signed-off-by: Al Viro --- fs/namei.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 10635d329175..9e701e28a329 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -455,14 +455,6 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; - /* - * It can be possible to revalidate the dentry that we started - * the path walk with. force_reval_path may also revalidate the - * dentry already committed to the nameidata. - */ - if (unlikely(parent == dentry)) - return nameidata_drop_rcu(nd); - BUG_ON(!(nd->flags & LOOKUP_RCU)); if (nd->root.mnt) { spin_lock(&fs->lock); -- cgit v1.2.1 From 261cd298a8c363d7985e3482946edb4bfedacf98 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 15 Feb 2011 09:43:32 +0100 Subject: s390: remove task_show_regs task_show_regs used to be a debugging aid in the early bringup days of Linux on s390. /proc//status is a world readable file, it is not a good idea to show the registers of a process. The only correct fix is to remove task_show_regs. Reported-by: Al Viro Signed-off-by: Martin Schwidefsky Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index df2b703b9d0f..7c99c1cf7e5c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cap(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); -#if defined(CONFIG_S390) - task_show_regs(m, task); -#endif task_context_switch_counts(m, task); return 0; } -- cgit v1.2.1 From 58a69cb47ec6991bf006a3e5d202e8571b0327a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 09:25:31 +0100 Subject: workqueue, freezer: unify spelling of 'freeze' + 'able' to 'freezable' There are two spellings in use for 'freeze' + 'able' - 'freezable' and 'freezeable'. The former is the more prominent one. The latter is mostly used by workqueue and in a few other odd places. Unify the spelling to 'freezable'. Signed-off-by: Tejun Heo Reported-by: Alan Stern Acked-by: "Rafael J. Wysocki" Acked-by: Greg Kroah-Hartman Acked-by: Dmitry Torokhov Cc: David Woodhouse Cc: Alex Dubov Cc: "David S. Miller" Cc: Steven Whitehouse --- fs/gfs2/glock.c | 4 ++-- fs/gfs2/main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 08a8beb152e6..7cd9a5a68d59 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1779,11 +1779,11 @@ int __init gfs2_glock_init(void) #endif glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_FREEZEABLE, 0); + WQ_HIGHPRI | WQ_FREEZABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", - WQ_MEM_RECLAIM | WQ_FREEZEABLE, + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index ebef7ab6e17e..85ba027d1c4d 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -144,7 +144,7 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", - WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0); + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); if (!gfs_recovery_wq) goto fail_wq; -- cgit v1.2.1 From 3abb17e82f08628b59e20d8cbcb55e2204180f69 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Feb 2011 08:56:55 -0800 Subject: vfs: fix BUG_ON() in fs/namei.c:1461 When Al moved the nameidata_dentry_drop_rcu_maybe() call into the do_follow_link function in commit 844a391799c2 ("nothing in do_follow_link() is going to see RCU"), he mistakenly left the BUG_ON(inode != path->dentry->d_inode); behind. Which would otherwise be ok, but that BUG_ON() really needs to be _after_ dropping RCU, since the dentry isn't necessarily stable otherwise. So complete the code movement in that commit, and move the BUG_ON() into do_follow_link() too. This means that we need to pass in 'inode' as an argument (just for this one use), but that's a small thing. And eventually we may be confident enough in our path lookup that we can just remove the BUG_ON() and the unnecessary inode argument. Reported-and-tested-by: Eric Dumazet Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/namei.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9e701e28a329..0087cf9c2c6b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -795,7 +795,7 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) * Without that kind of total limit, nasty chains of consecutive * symlinks can cause almost arbitrarily long lookups. */ -static inline int do_follow_link(struct path *path, struct nameidata *nd) +static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd) { void *cookie; int err = -ELOOP; @@ -803,6 +803,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) /* We drop rcu-walk here */ if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) return -ECHILD; + BUG_ON(inode != path->dentry->d_inode); if (current->link_count >= MAX_NESTED_LINKS) goto loop; @@ -1413,8 +1414,7 @@ exec_again: goto out_dput; if (inode->i_op->follow_link) { - BUG_ON(inode != next.dentry->d_inode); - err = do_follow_link(&next, nd); + err = do_follow_link(inode, &next, nd); if (err) goto return_err; nd->inode = nd->path.dentry->d_inode; @@ -1458,8 +1458,7 @@ last_component: break; if (inode && unlikely(inode->i_op->follow_link) && (lookup_flags & LOOKUP_FOLLOW)) { - BUG_ON(inode != next.dentry->d_inode); - err = do_follow_link(&next, nd); + err = do_follow_link(inode, &next, nd); if (err) goto return_err; nd->inode = nd->path.dentry->d_inode; -- cgit v1.2.1 From 47c85291d3dd1a51501555000b90f8e281a0458e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Feb 2011 13:08:35 +1100 Subject: nfsd: correctly handle return value from nfsd_map_name_to_* These functions return an nfs status, not a host_err. So don't try to convert before returning. This is a regression introduced by 3c726023402a2f3b28f49b9d90ebf9e71151157d; I fixed up two of the callers, but missed these two. Cc: stable@kernel.org Reported-by: Herbert Poetzl Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 956629b9cdc9..1275b8655070 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) - goto out_nfserr; + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; iattr->ia_valid |= ATTR_UID; } if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { @@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) - goto out_nfserr; + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; iattr->ia_valid |= ATTR_GID; } if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { -- cgit v1.2.1 From e51900f7d38cbcfb481d84567fd92540e7e1d23a Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Wed, 16 Feb 2011 18:11:53 -0500 Subject: block: revert block_dev read-only check This reverts commit 75f1dc0d076d ("block: check bdev_read_only() from blkdev_get()"). That commit added stricter checking to make sure devices that were being used read-only were actually opened in that mode. It turns out that the change breaks a bunch of kernel code that opens block devices. Affected systems include dm, md, and the loop device. Because strict checking for read-only opens of block devices was not done before this, the code that opens the devices was opening them read-write even if they were being used read-only. Auditing all that code will take time, and new userspace packages for dm, mdadm, etc. will also be required. Signed-off-by: Chuck Ebbert Signed-off-by: Linus Torvalds --- fs/block_dev.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 333a7bb4cb9c..4fb8a3431531 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1215,12 +1215,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) res = __blkdev_get(bdev, mode, 0); - /* __blkdev_get() may alter read only status, check it afterwards */ - if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { - __blkdev_put(bdev, mode, 0); - res = -EACCES; - } - if (whole) { /* finish claiming */ mutex_lock(&bdev->bd_mutex); @@ -1298,6 +1292,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, if (err) return ERR_PTR(err); + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } + return bdev; } EXPORT_SYMBOL(blkdev_get_by_path); -- cgit v1.2.1 From fa7ea87a057958a8b7926c1a60a3ca6d696328ed Mon Sep 17 00:00:00 2001 From: Timo Warns Date: Thu, 17 Feb 2011 22:27:40 +0100 Subject: fs/partitions: Validate map_count in Mac partition tables Validate number of blocks in map and remove redundant variable. Signed-off-by: Timo Warns Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/partitions/mac.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index 68d6a216ee79..11f688bd76c5 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len) int mac_partition(struct parsed_partitions *state) { - int slot = 1; Sector sect; unsigned char *data; - int blk, blocks_in_map; + int slot, blocks_in_map; unsigned secsize; #ifdef CONFIG_PPC_PMAC int found_root = 0; @@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; /* not a MacOS disk */ } - strlcat(state->pp_buf, " [mac]", PAGE_SIZE); blocks_in_map = be32_to_cpu(part->map_count); - for (blk = 1; blk <= blocks_in_map; ++blk) { - int pos = blk * secsize; + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; put_dev_sector(sect); data = read_part_sector(state, pos/512, §); if (!data) @@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state) } if (goodness > found_root_goodness) { - found_root = blk; + found_root = slot; found_root_goodness = goodness; } } #endif /* CONFIG_PPC_PMAC */ - - ++slot; } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) -- cgit v1.2.1