summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-10-17 09:58:25 +0200
committerIngo Molnar <mingo@elte.hu>2009-10-17 09:58:25 +0200
commitbb3c3e807140816b5f5fd4840473ee52a916ad4f (patch)
tree9e8a69d266a7df86ca16177eefffab4b4e910753 /fs
parent595c36490deb49381dc51231a3d5e6b66786ed27 (diff)
parent012abeea669ea49636cf952d13298bb68654146a (diff)
downloadlinux-next-bb3c3e807140816b5f5fd4840473ee52a916ad4f.tar.gz
Merge commit 'v2.6.32-rc5' into perf/probes
Conflicts: kernel/trace/trace_event_profile.c Merge reason: update to -rc5 and resolve conflict. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig9
-rw-r--r--fs/9p/Makefile3
-rw-r--r--fs/9p/cache.c474
-rw-r--r--fs/9p/cache.h176
-rw-r--r--fs/9p/v9fs.c196
-rw-r--r--fs/9p/v9fs.h13
-rw-r--r--fs/9p/v9fs_vfs.h6
-rw-r--r--fs/9p/vfs_addr.c88
-rw-r--r--fs/9p/vfs_file.c25
-rw-r--r--fs/9p/vfs_inode.c61
-rw-r--r--fs/9p/vfs_super.c16
-rw-r--r--fs/adfs/inode.c7
-rw-r--r--fs/afs/cache.h12
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/attr.c46
-rw-r--r--fs/befs/linuxvfs.c7
-rw-r--r--fs/binfmt_elf.c52
-rw-r--r--fs/binfmt_elf_fdpic.c17
-rw-r--r--fs/binfmt_flat.c22
-rw-r--r--fs/bio.c49
-rw-r--r--fs/block_dev.c140
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c329
-rw-r--r--fs/btrfs/async-thread.h18
-rw-r--r--fs/btrfs/btrfs_inode.h17
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c6
-rw-r--r--fs/btrfs/ctree.h114
-rw-r--r--fs/btrfs/dir-item.c47
-rw-r--r--fs/btrfs/disk-io.c281
-rw-r--r--fs/btrfs/export.c133
-rw-r--r--fs/btrfs/extent-tree.c2207
-rw-r--r--fs/btrfs/extent_io.c446
-rw-r--r--fs/btrfs/extent_io.h43
-rw-r--r--fs/btrfs/extent_map.c103
-rw-r--r--fs/btrfs/extent_map.h5
-rw-r--r--fs/btrfs/file.c114
-rw-r--r--fs/btrfs/free-space-cache.c36
-rw-r--r--fs/btrfs/inode-item.c4
-rw-r--r--fs/btrfs/inode-map.c93
-rw-r--r--fs/btrfs/inode.c987
-rw-r--r--fs/btrfs/ioctl.c404
-rw-r--r--fs/btrfs/ioctl.h3
-rw-r--r--fs/btrfs/ordered-data.c132
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/orphan.c20
-rw-r--r--fs/btrfs/relocation.c284
-rw-r--r--fs/btrfs/root-tree.c138
-rw-r--r--fs/btrfs/super.c10
-rw-r--r--fs/btrfs/transaction.c93
-rw-r--r--fs/btrfs/transaction.h5
-rw-r--r--fs/btrfs/tree-log.c81
-rw-r--r--fs/btrfs/tree-log.h3
-rw-r--r--fs/btrfs/volumes.c121
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/buffer.c20
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifsfs.c96
-rw-r--r--fs/cifs/cifsglob.h21
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c1
-rw-r--r--fs/cifs/connect.c1
-rw-r--r--fs/cifs/dir.c64
-rw-r--r--fs/cifs/file.c137
-rw-r--r--fs/cifs/inode.c53
-rw-r--r--fs/cifs/misc.c34
-rw-r--r--fs/cifs/readdir.c4
-rw-r--r--fs/cifs/transport.c50
-rw-r--r--fs/coda/coda_int.h1
-rw-r--r--fs/coda/psdev.c1
-rw-r--r--fs/compat.c24
-rw-r--r--fs/dlm/lowcomms.c36
-rw-r--r--fs/drop_caches.c4
-rw-r--r--fs/ecryptfs/Kconfig5
-rw-r--r--fs/ecryptfs/crypto.c39
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/keystore.c39
-rw-r--r--fs/ecryptfs/kthread.c24
-rw-r--r--fs/ecryptfs/main.c10
-rw-r--r--fs/ecryptfs/mmap.c4
-rw-r--r--fs/ecryptfs/read_write.c32
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/exec.c114
-rw-r--r--fs/exofs/super.c6
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext3/super.c13
-rw-r--r--fs/ext4/Kconfig14
-rw-r--r--fs/ext4/ext4.h54
-rw-r--r--fs/ext4/ext4_extents.h7
-rw-r--r--fs/ext4/ext4_jbd2.h6
-rw-r--r--fs/ext4/extents.c444
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/inode.c582
-rw-r--r--fs/ext4/mballoc.c305
-rw-r--r--fs/ext4/mballoc.h35
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c20
-rw-r--r--fs/ext4/namei.c3
-rw-r--r--fs/ext4/super.c130
-rw-r--r--fs/fat/fat.h2
-rw-r--r--fs/fat/inode.c34
-rw-r--r--fs/fat/misc.c8
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c108
-rw-r--r--fs/file.c1
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fs-writeback.c165
-rw-r--r--fs/fuse/dir.c14
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/fuse/inode.c11
-rw-r--r--fs/gfs2/aops.c3
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/ops_inode.c1
-rw-r--r--fs/hfs/mdb.c6
-rw-r--r--fs/hfsplus/super.c6
-rw-r--r--fs/hugetlbfs/inode.c45
-rw-r--r--fs/inode.c89
-rw-r--r--fs/internal.h1
-rw-r--r--fs/ioctl.c9
-rw-r--r--fs/isofs/inode.c8
-rw-r--r--fs/jbd2/checkpoint.c7
-rw-r--r--fs/jbd2/commit.c59
-rw-r--r--fs/jbd2/journal.c198
-rw-r--r--fs/jfs/super.c9
-rw-r--r--fs/libfs.c13
-rw-r--r--fs/lockd/xdr.c1
-rw-r--r--fs/lockd/xdr4.c1
-rw-r--r--fs/namespace.c77
-rw-r--r--fs/ncpfs/inode.c12
-rw-r--r--fs/ncpfs/ioctl.c6
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/client.c15
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/fscache.c25
-rw-r--r--fs/nfs/fscache.h6
-rw-r--r--fs/nfs/inode.c54
-rw-r--r--fs/nfs/nfs2xdr.c1
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs4namespace.c12
-rw-r--r--fs/nfs/nfs4proc.c1
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4xdr.c1
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c102
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nilfs2/btnode.c1
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/inode.c1
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nls/nls_base.c11
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/super.c10
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/netdebug.c4
-rw-r--r--fs/ocfs2/dlm/dlmast.c1
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c1
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c9
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmlock.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlm/dlmthread.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c1
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/super.c3
-rw-r--r--fs/ocfs2/symlink.c1
-rw-r--r--fs/omfs/dir.c2
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/omfs/omfs.h4
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/kcore.c1
-rw-r--r--fs/proc/meminfo.c9
-rw-r--r--fs/proc/page.c5
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/uptime.c7
-rw-r--r--fs/ramfs/file-nommu.c18
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/romfs/storage.c4
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/select.c1
-rw-r--r--fs/seq_file.c74
-rw-r--r--fs/smbfs/inode.c10
-rw-r--r--fs/super.c67
-rw-r--r--fs/sysfs/bin.c4
-rw-r--r--fs/sysfs/dir.c3
-rw-r--r--fs/sysfs/file.c14
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c39
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c41
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c59
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c36
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c3
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/xfs_inode.c4
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c18
-rw-r--r--fs/xfs/xfs_itable.c21
-rw-r--r--fs/xfs/xfs_vnodeops.c6
212 files changed, 7542 insertions, 4371 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 74e0723e90bc..795233702a4e 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -8,3 +8,12 @@ config 9P_FS
See <http://v9fs.sf.net> for more information.
If unsure, say N.
+
+config 9P_FSCACHE
+ bool "Enable 9P client caching support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for 9p clients using FS-Cache
+
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index bc7f0d1551e6..1a940ec7af61 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dir.o \
vfs_dentry.o \
v9fs.o \
- fid.o \
+ fid.o
+9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
new file mode 100644
index 000000000000..51c94e26a346
--- /dev/null
+++ b/fs/9p/cache.c
@@ -0,0 +1,474 @@
+/*
+ * V9FS cache definitions.
+ *
+ * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include <linux/jiffies.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+
+#include "v9fs.h"
+#include "cache.h"
+
+#define CACHETAG_LEN 11
+
+struct kmem_cache *vcookie_cache;
+
+struct fscache_netfs v9fs_cache_netfs = {
+ .name = "9p",
+ .version = 0,
+};
+
+static void init_once(void *foo)
+{
+ struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
+ vcookie->fscache = NULL;
+ vcookie->qid = NULL;
+ inode_init_once(&vcookie->inode);
+}
+
+/**
+ * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
+ * vcookie to inode mapping
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_init_vcookiecache(void)
+{
+ vcookie_cache = kmem_cache_create("vcookie_cache",
+ sizeof(struct v9fs_cookie),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once);
+ if (!vcookie_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * v9fs_destroy_vcookiecache - destroy the cache of vcookies
+ *
+ */
+
+static void v9fs_destroy_vcookiecache(void)
+{
+ kmem_cache_destroy(vcookie_cache);
+}
+
+int __v9fs_cache_register(void)
+{
+ int ret;
+ ret = v9fs_init_vcookiecache();
+ if (ret < 0)
+ return ret;
+
+ return fscache_register_netfs(&v9fs_cache_netfs);
+}
+
+void __v9fs_cache_unregister(void)
+{
+ v9fs_destroy_vcookiecache();
+ fscache_unregister_netfs(&v9fs_cache_netfs);
+}
+
+/**
+ * v9fs_random_cachetag - Generate a random tag to be associated
+ * with a new cache session.
+ *
+ * The value of jiffies is used for a fairly randomly cache tag.
+ */
+
+static
+int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+{
+ v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
+ if (!v9ses->cachetag)
+ return -ENOMEM;
+
+ return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
+}
+
+static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ struct v9fs_session_info *v9ses;
+ uint16_t klen = 0;
+
+ v9ses = (struct v9fs_session_info *)cookie_netfs_data;
+ P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
+ buffer, bufmax);
+
+ if (v9ses->cachetag)
+ klen = strlen(v9ses->cachetag);
+
+ if (klen > bufmax)
+ return 0;
+
+ memcpy(buffer, v9ses->cachetag, klen);
+ P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+ return klen;
+}
+
+const struct fscache_cookie_def v9fs_cache_session_index_def = {
+ .name = "9P.session",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = v9fs_cache_session_get_key,
+};
+
+void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
+{
+ /* If no cache session tag was specified, we generate a random one. */
+ if (!v9ses->cachetag)
+ v9fs_random_cachetag(v9ses);
+
+ v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
+ &v9fs_cache_session_index_def,
+ v9ses);
+ P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
+ v9ses->fscache);
+}
+
+void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
+{
+ P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
+ v9ses->fscache);
+ fscache_relinquish_cookie(v9ses->fscache, 0);
+ v9ses->fscache = NULL;
+}
+
+
+static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+ memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
+ vcookie->qid->path);
+ return sizeof(vcookie->qid->path);
+}
+
+static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+ *size = i_size_read(&vcookie->inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+ *size);
+}
+
+static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+ memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
+ vcookie->qid->version);
+ return sizeof(vcookie->qid->version);
+}
+
+static enum
+fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen)
+{
+ const struct v9fs_cookie *vcookie = cookie_netfs_data;
+
+ if (buflen != sizeof(vcookie->qid->version))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ if (memcmp(buffer, &vcookie->qid->version,
+ sizeof(vcookie->qid->version)))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
+{
+ struct v9fs_cookie *vcookie = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ for (;;) {
+ nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+ first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+const struct fscache_cookie_def v9fs_cache_inode_index_def = {
+ .name = "9p.inode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = v9fs_cache_inode_get_key,
+ .get_attr = v9fs_cache_inode_get_attr,
+ .get_aux = v9fs_cache_inode_get_aux,
+ .check_aux = v9fs_cache_inode_check_aux,
+ .now_uncached = v9fs_cache_inode_now_uncached,
+};
+
+void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie;
+ struct v9fs_session_info *v9ses;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ vcookie = v9fs_inode2cookie(inode);
+ if (vcookie->fscache)
+ return;
+
+ v9ses = v9fs_inode2v9ses(inode);
+ vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+ &v9fs_cache_inode_index_def,
+ vcookie);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
+ vcookie->fscache);
+}
+
+void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ if (!vcookie->fscache)
+ return;
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
+ vcookie->fscache);
+
+ fscache_relinquish_cookie(vcookie->fscache, 0);
+ vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_flush_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ if (!vcookie->fscache)
+ return;
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
+ vcookie->fscache);
+
+ fscache_relinquish_cookie(vcookie->fscache, 1);
+ vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ struct p9_fid *fid;
+
+ if (!vcookie->fscache)
+ return;
+
+ spin_lock(&vcookie->lock);
+ fid = filp->private_data;
+ if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+ v9fs_cache_inode_flush_cookie(inode);
+ else
+ v9fs_cache_inode_get_cookie(inode);
+
+ spin_unlock(&vcookie->lock);
+}
+
+void v9fs_cache_inode_reset_cookie(struct inode *inode)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ struct v9fs_session_info *v9ses;
+ struct fscache_cookie *old;
+
+ if (!vcookie->fscache)
+ return;
+
+ old = vcookie->fscache;
+
+ spin_lock(&vcookie->lock);
+ fscache_relinquish_cookie(vcookie->fscache, 1);
+
+ v9ses = v9fs_inode2v9ses(inode);
+ vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+ &v9fs_cache_inode_index_def,
+ vcookie);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
+ inode, old, vcookie->fscache);
+
+ spin_unlock(&vcookie->lock);
+}
+
+int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ struct inode *inode = page->mapping->host;
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ BUG_ON(!vcookie->fscache);
+
+ if (PageFsCache(page)) {
+ if (fscache_check_page_write(vcookie->fscache, page)) {
+ if (!(gfp & __GFP_WAIT))
+ return 0;
+ fscache_wait_on_page_write(vcookie->fscache, page);
+ }
+
+ fscache_uncache_page(vcookie->fscache, page);
+ ClearPageFsCache(page);
+ }
+
+ return 1;
+}
+
+void __v9fs_fscache_invalidate_page(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ BUG_ON(!vcookie->fscache);
+
+ if (PageFsCache(page)) {
+ fscache_wait_on_page_write(vcookie->fscache, page);
+ BUG_ON(!PageLocked(page));
+ fscache_uncache_page(vcookie->fscache, page);
+ ClearPageFsCache(page);
+ }
+}
+
+static void v9fs_vfs_readpage_complete(struct page *page, void *data,
+ int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+}
+
+/**
+ * __v9fs_readpage_from_fscache - read a page from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ int ret;
+ const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ if (!vcookie->fscache)
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_page(vcookie->fscache,
+ page,
+ v9fs_vfs_readpage_complete,
+ NULL,
+ GFP_KERNEL);
+ switch (ret) {
+ case -ENOBUFS:
+ case -ENODATA:
+ P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+ return 1;
+ case 0:
+ P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ return ret;
+ default:
+ P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ return ret;
+ }
+}
+
+/**
+ * __v9fs_readpages_from_fscache - read multiple pages from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ int ret;
+ const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+ if (!vcookie->fscache)
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_pages(vcookie->fscache,
+ mapping, pages, nr_pages,
+ v9fs_vfs_readpage_complete,
+ NULL,
+ mapping_gfp_mask(mapping));
+ switch (ret) {
+ case -ENOBUFS:
+ case -ENODATA:
+ P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+ return 1;
+ case 0:
+ BUG_ON(!list_empty(pages));
+ BUG_ON(*nr_pages != 0);
+ P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ return ret;
+ default:
+ P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ return ret;
+ }
+}
+
+/**
+ * __v9fs_readpage_to_fscache - write a page to the cache
+ *
+ */
+
+void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ int ret;
+ const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+ P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+ P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+ if (ret != 0)
+ v9fs_uncache_page(inode, page);
+}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
new file mode 100644
index 000000000000..a94192bfaee8
--- /dev/null
+++ b/fs/9p/cache.h
@@ -0,0 +1,176 @@
+/*
+ * V9FS cache definitions.
+ *
+ * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#ifndef _9P_CACHE_H
+#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
+#include <linux/spinlock.h>
+
+extern struct kmem_cache *vcookie_cache;
+
+struct v9fs_cookie {
+ spinlock_t lock;
+ struct inode inode;
+ struct fscache_cookie *fscache;
+ struct p9_qid *qid;
+};
+
+static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
+{
+ return container_of(inode, struct v9fs_cookie, inode);
+}
+
+extern struct fscache_netfs v9fs_cache_netfs;
+extern const struct fscache_cookie_def v9fs_cache_session_index_def;
+extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+
+extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
+extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+
+extern void v9fs_cache_inode_get_cookie(struct inode *inode);
+extern void v9fs_cache_inode_put_cookie(struct inode *inode);
+extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
+extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
+extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
+
+extern int __v9fs_cache_register(void);
+extern void __v9fs_cache_unregister(void);
+
+extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
+extern void __v9fs_fscache_invalidate_page(struct page *page);
+extern int __v9fs_readpage_from_fscache(struct inode *inode,
+ struct page *page);
+extern int __v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages);
+extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+
+
+/**
+ * v9fs_cache_register - Register v9fs file system with the cache
+ */
+static inline int v9fs_cache_register(void)
+{
+ return __v9fs_cache_register();
+}
+
+/**
+ * v9fs_cache_unregister - Unregister v9fs from the cache
+ */
+static inline void v9fs_cache_unregister(void)
+{
+ __v9fs_cache_unregister();
+}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+ gfp_t gfp)
+{
+ return __v9fs_fscache_release_page(page, gfp);
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page)
+{
+ __v9fs_fscache_invalidate_page(page);
+}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
+{
+ return __v9fs_readpage_from_fscache(inode, page);
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return __v9fs_readpages_from_fscache(inode, mapping, pages,
+ nr_pages);
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ __v9fs_readpage_to_fscache(inode, page);
+}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ fscache_uncache_page(vcookie->fscache, page);
+ BUG_ON(PageFsCache(page));
+}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+ struct p9_qid *qid)
+{
+ struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+ spin_lock(&vcookie->lock);
+ vcookie->qid = qid;
+ spin_unlock(&vcookie->lock);
+}
+
+#else /* CONFIG_9P_FSCACHE */
+
+static inline int v9fs_cache_register(void)
+{
+ return 1;
+}
+
+static inline void v9fs_cache_unregister(void) {}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+ gfp_t gfp) {
+ return 1;
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page) {}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+ struct p9_qid *qid)
+{}
+
+#endif /* CONFIG_9P_FSCACHE */
+#endif /* _9P_CACHE_H */
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f7003cfac63d..cf62b05e296a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -34,21 +34,25 @@
#include <net/9p/transport.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
+#include "cache.h"
+
+static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
+static LIST_HEAD(v9fs_sessionlist);
/*
- * Option Parsing (code inspired by NFS code)
- * NOTE: each transport will parse its own options
- */
+ * Option Parsing (code inspired by NFS code)
+ * NOTE: each transport will parse its own options
+ */
enum {
/* Options that take integer arguments */
Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
/* String options */
- Opt_uname, Opt_remotename, Opt_trans,
+ Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
- Opt_cache_loose,
+ Opt_cache_loose, Opt_fscache,
/* Access options */
Opt_access,
/* Error token */
@@ -63,8 +67,10 @@ static const match_table_t tokens = {
{Opt_uname, "uname=%s"},
{Opt_remotename, "aname=%s"},
{Opt_nodevmap, "nodevmap"},
- {Opt_cache_loose, "cache=loose"},
+ {Opt_cache, "cache=%s"},
{Opt_cache_loose, "loose"},
+ {Opt_fscache, "fscache"},
+ {Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_err, NULL}
};
@@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->afid = ~0;
v9ses->debug = 0;
v9ses->cache = 0;
+#ifdef CONFIG_9P_FSCACHE
+ v9ses->cachetag = NULL;
+#endif
if (!opts)
return 0;
options = kstrdup(opts, GFP_KERNEL);
- if (!options) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- return -ENOMEM;
- }
+ if (!options)
+ goto fail_option_alloc;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_cache_loose:
v9ses->cache = CACHE_LOOSE;
break;
+ case Opt_fscache:
+ v9ses->cache = CACHE_FSCACHE;
+ break;
+ case Opt_cachetag:
+#ifdef CONFIG_9P_FSCACHE
+ v9ses->cachetag = match_strdup(&args[0]);
+#endif
+ break;
+ case Opt_cache:
+ s = match_strdup(&args[0]);
+ if (!s)
+ goto fail_option_alloc;
+
+ if (strcmp(s, "loose") == 0)
+ v9ses->cache = CACHE_LOOSE;
+ else if (strcmp(s, "fscache") == 0)
+ v9ses->cache = CACHE_FSCACHE;
+ else
+ v9ses->cache = CACHE_NONE;
+ kfree(s);
+ break;
case Opt_access:
s = match_strdup(&args[0]);
- if (!s) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "failed to allocate copy"
- " of option argument\n");
- ret = -ENOMEM;
- break;
- }
+ if (!s)
+ goto fail_option_alloc;
+
v9ses->flags &= ~V9FS_ACCESS_MASK;
if (strcmp(s, "user") == 0)
v9ses->flags |= V9FS_ACCESS_USER;
@@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
}
kfree(options);
return ret;
+
+fail_option_alloc:
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "failed to allocate copy of option argument\n");
+ return -ENOMEM;
}
/**
@@ -200,6 +228,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
return ERR_PTR(-ENOMEM);
}
+ spin_lock(&v9fs_sessionlist_lock);
+ list_add(&v9ses->slist, &v9fs_sessionlist);
+ spin_unlock(&v9fs_sessionlist_lock);
+
v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
strcpy(v9ses->uname, V9FS_DEFUSER);
strcpy(v9ses->aname, V9FS_DEFANAME);
@@ -249,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
else
fid->uid = ~0;
+#ifdef CONFIG_9P_FSCACHE
+ /* register the session for caching */
+ v9fs_cache_session_get_cookie(v9ses);
+#endif
+
return fid;
error:
@@ -268,8 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
v9ses->clnt = NULL;
}
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->fscache) {
+ v9fs_cache_session_put_cookie(v9ses);
+ kfree(v9ses->cachetag);
+ }
+#endif
__putname(v9ses->uname);
__putname(v9ses->aname);
+
+ spin_lock(&v9fs_sessionlist_lock);
+ list_del(&v9ses->slist);
+ spin_unlock(&v9fs_sessionlist_lock);
}
/**
@@ -286,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
extern int v9fs_error_init(void);
+static struct kobject *v9fs_kobj;
+
+#ifdef CONFIG_9P_FSCACHE
/**
- * v9fs_init - Initialize module
+ * caches_show - list caches associated with a session
+ *
+ * Returns the size of buffer written.
+ */
+
+static ssize_t caches_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t n = 0, count = 0, limit = PAGE_SIZE;
+ struct v9fs_session_info *v9ses;
+
+ spin_lock(&v9fs_sessionlist_lock);
+ list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
+ if (v9ses->cachetag) {
+ n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+ if (n < 0) {
+ count = n;
+ break;
+ }
+
+ count += n;
+ limit -= n;
+ }
+ }
+
+ spin_unlock(&v9fs_sessionlist_lock);
+ return count;
+}
+
+static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
+#endif /* CONFIG_9P_FSCACHE */
+
+static struct attribute *v9fs_attrs[] = {
+#ifdef CONFIG_9P_FSCACHE
+ &v9fs_attr_cache.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group v9fs_attr_group = {
+ .attrs = v9fs_attrs,
+};
+
+/**
+ * v9fs_sysfs_init - Initialize the v9fs sysfs interface
+ *
+ */
+
+static int v9fs_sysfs_init(void)
+{
+ v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
+ if (!v9fs_kobj)
+ return -ENOMEM;
+
+ if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+ kobject_put(v9fs_kobj);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/**
+ * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
+ *
+ */
+
+static void v9fs_sysfs_cleanup(void)
+{
+ sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
+ kobject_put(v9fs_kobj);
+}
+
+/**
+ * init_v9fs - Initialize module
*
*/
static int __init init_v9fs(void)
{
+ int err;
printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
- return register_filesystem(&v9fs_fs_type);
+ err = register_filesystem(&v9fs_fs_type);
+ if (err < 0) {
+ printk(KERN_ERR "Failed to register filesystem\n");
+ return err;
+ }
+
+ err = v9fs_cache_register();
+ if (err < 0) {
+ printk(KERN_ERR "Failed to register v9fs for caching\n");
+ goto out_fs_unreg;
+ }
+
+ err = v9fs_sysfs_init();
+ if (err < 0) {
+ printk(KERN_ERR "Failed to register with sysfs\n");
+ goto out_sysfs_cleanup;
+ }
+
+ return 0;
+
+out_sysfs_cleanup:
+ v9fs_sysfs_cleanup();
+
+out_fs_unreg:
+ unregister_filesystem(&v9fs_fs_type);
+
+ return err;
}
/**
- * v9fs_init - shutdown module
+ * exit_v9fs - shutdown module
*
*/
static void __exit exit_v9fs(void)
{
+ v9fs_sysfs_cleanup();
+ v9fs_cache_unregister();
unregister_filesystem(&v9fs_fs_type);
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 38762bf102a9..019f4ccb70c1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -51,6 +51,7 @@ enum p9_session_flags {
enum p9_cache_modes {
CACHE_NONE,
CACHE_LOOSE,
+ CACHE_FSCACHE,
};
/**
@@ -60,6 +61,8 @@ enum p9_cache_modes {
* @debug: debug level
* @afid: authentication handle
* @cache: cache mode of type &p9_cache_modes
+ * @cachetag: the tag of the cache associated with this session
+ * @fscache: session cookie associated with FS-Cache
* @options: copy of options string given by user
* @uname: string user name to mount hierarchy as
* @aname: mount specifier for remote hierarchy
@@ -68,7 +71,7 @@ enum p9_cache_modes {
* @dfltgid: default numeric groupid to mount hierarchy as
* @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
* @clnt: reference to 9P network client instantiated for this session
- * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug
+ * @slist: reference to list of registered 9p sessions
*
* This structure holds state for each session instance established during
* a sys_mount() .
@@ -84,6 +87,10 @@ struct v9fs_session_info {
unsigned short debug;
unsigned int afid;
unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+ char *cachetag;
+ struct fscache_cookie *fscache;
+#endif
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
@@ -92,11 +99,9 @@ struct v9fs_session_info {
unsigned int dfltgid; /* default gid for legacy support */
u32 uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
- struct dentry *debugfs_dir;
+ struct list_head slist; /* list of sessions registered with v9fs */
};
-extern struct dentry *v9fs_debugfs_root;
-
struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
char *);
void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f0c7de78e205..3a7560e35865 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations;
extern const struct dentry_operations v9fs_dentry_operations;
extern const struct dentry_operations v9fs_cached_dentry_operations;
+#ifdef CONFIG_9P_FSCACHE
+struct inode *v9fs_alloc_inode(struct super_block *sb);
+void v9fs_destroy_inode(struct inode *inode);
+#endif
+
struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+void v9fs_clear_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
int v9fs_dir_release(struct inode *inode, struct file *filp);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 92828281a30b..90e38449f4b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,6 +38,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
+#include "cache.h"
/**
* v9fs_vfs_readpage - read an entire page in from 9P
@@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
int retval;
loff_t offset;
char *buffer;
+ struct inode *inode;
+ inode = page->mapping->host;
P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+ BUG_ON(!PageLocked(page));
+
+ retval = v9fs_readpage_from_fscache(inode, page);
+ if (retval == 0)
+ return retval;
+
buffer = kmap(page);
offset = page_offset(page);
retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
- if (retval < 0)
+ if (retval < 0) {
+ v9fs_uncache_page(inode, page);
goto done;
+ }
memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
flush_dcache_page(page);
SetPageUptodate(page);
+
+ v9fs_readpage_to_fscache(inode, page);
retval = 0;
done:
@@ -72,6 +86,78 @@ done:
return retval;
}
+/**
+ * v9fs_vfs_readpages - read a set of pages from 9P
+ *
+ * @filp: file being read
+ * @mapping: the address space
+ * @pages: list of pages to read
+ * @nr_pages: count of pages to read
+ *
+ */
+
+static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ int ret = 0;
+ struct inode *inode;
+
+ inode = mapping->host;
+ P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+
+ ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
+ if (ret == 0)
+ return ret;
+
+ ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+ P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+ return ret;
+}
+
+/**
+ * v9fs_release_page - release the private state associated with a page
+ *
+ * Returns 1 if the page can be released, false otherwise.
+ */
+
+static int v9fs_release_page(struct page *page, gfp_t gfp)
+{
+ if (PagePrivate(page))
+ return 0;
+
+ return v9fs_fscache_release_page(page, gfp);
+}
+
+/**
+ * v9fs_invalidate_page - Invalidate a page completely or partially
+ *
+ * @page: structure to page
+ * @offset: offset in the page
+ */
+
+static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+{
+ if (offset == 0)
+ v9fs_fscache_invalidate_page(page);
+}
+
+/**
+ * v9fs_launder_page - Writeback a dirty page
+ * Since the writes go directly to the server, we simply return a 0
+ * here to indicate success.
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_launder_page(struct page *page)
+{
+ return 0;
+}
+
const struct address_space_operations v9fs_addr_operations = {
.readpage = v9fs_vfs_readpage,
+ .readpages = v9fs_vfs_readpages,
+ .releasepage = v9fs_release_page,
+ .invalidatepage = v9fs_invalidate_page,
+ .launder_page = v9fs_launder_page,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 68bf2af6c389..3902bf43a088 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/list.h>
+#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <linux/idr.h>
#include <net/9p/9p.h>
@@ -40,6 +41,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "fid.h"
+#include "cache.h"
static const struct file_operations v9fs_cached_file_operations;
@@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
return err;
}
if (omode & P9_OTRUNC) {
- inode->i_size = 0;
+ i_size_write(inode, 0);
inode->i_blocks = 0;
}
if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
@@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
/* enable cached file options */
if(file->f_op == &v9fs_file_operations)
file->f_op = &v9fs_cached_file_operations;
+
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_cache_inode_set_cookie(inode, file);
+#endif
}
return 0;
@@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
struct p9_client *clnt;
struct inode *inode = filp->f_path.dentry->d_inode;
int origin = *offset;
+ unsigned long pg_start, pg_end;
P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
(int)count, (int)*offset);
@@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
if (count < rsize)
rsize = count;
- n = p9_client_write(fid, NULL, data+total, *offset+total,
+ n = p9_client_write(fid, NULL, data+total, origin+total,
rsize);
if (n <= 0)
break;
@@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
} while (count > 0);
if (total > 0) {
- invalidate_inode_pages2_range(inode->i_mapping, origin,
- origin+total);
+ pg_start = origin >> PAGE_CACHE_SHIFT;
+ pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
+ if (inode->i_mapping && inode->i_mapping->nrpages)
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pg_start, pg_end);
*offset += total;
- }
-
- if (*offset > inode->i_size) {
- inode->i_size = *offset;
- inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+ i_size_write(inode, i_size_read(inode) + total);
+ inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
}
if (n < 0)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 06a223d50a81..5947628aefef 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,6 +40,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "fid.h"
+#include "cache.h"
static const struct inode_operations v9fs_dir_inode_operations;
static const struct inode_operations v9fs_dir_inode_operations_ext;
@@ -197,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
wstat->extension = NULL;
}
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * v9fs_alloc_inode - helper function to allocate an inode
+ * This callback is executed before setting up the inode so that we
+ * can associate a vcookie with each inode.
+ *
+ */
+
+struct inode *v9fs_alloc_inode(struct super_block *sb)
+{
+ struct v9fs_cookie *vcookie;
+ vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
+ GFP_KERNEL);
+ if (!vcookie)
+ return NULL;
+
+ vcookie->fscache = NULL;
+ vcookie->qid = NULL;
+ spin_lock_init(&vcookie->lock);
+ return &vcookie->inode;
+}
+
+/**
+ * v9fs_destroy_inode - destroy an inode
+ *
+ */
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+}
+#endif
+
/**
* v9fs_get_inode - helper function to setup an inode
* @sb: superblock
@@ -326,6 +360,21 @@ error:
}
*/
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+void v9fs_clear_inode(struct inode *inode)
+{
+ filemap_fdatawrite(inode->i_mapping);
+
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_cache_inode_put_cookie(inode);
+#endif
+}
+
/**
* v9fs_inode_from_fid - populate an inode by issuing a attribute request
* @v9ses: session information
@@ -356,8 +405,14 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
v9fs_stat2inode(st, ret, sb);
ret->i_ino = v9fs_qid2ino(&st->qid);
+
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_vcookie_set_qid(ret, &st->qid);
+ v9fs_cache_inode_get_cookie(ret);
+#endif
p9stat_free(st);
kfree(st);
+
return ret;
error:
@@ -751,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_inode2v9ses(dentry->d_inode);
- if (v9ses->cache == CACHE_LOOSE)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return simple_getattr(mnt, dentry, stat);
fid = v9fs_fid_lookup(dentry);
@@ -872,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
} else
inode->i_rdev = 0;
- inode->i_size = stat->length;
+ i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+ inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
}
/**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8961f1a8f668..14a86448572c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,21 +44,9 @@
#include "v9fs_vfs.h"
#include "fid.h"
-static void v9fs_clear_inode(struct inode *);
static const struct super_operations v9fs_super_ops;
/**
- * v9fs_clear_inode - release an inode
- * @inode: inode to release
- *
- */
-
-static void v9fs_clear_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
-}
-
-/**
* v9fs_set_super - set the superblock
* @s: super block
* @data: file system specific data
@@ -220,6 +208,10 @@ v9fs_umount_begin(struct super_block *sb)
}
static const struct super_operations v9fs_super_ops = {
+#ifdef CONFIG_9P_FSCACHE
+ .alloc_inode = v9fs_alloc_inode,
+ .destroy_inode = v9fs_destroy_inode,
+#endif
.statfs = simple_statfs,
.clear_inode = v9fs_clear_inode,
.show_options = generic_show_options,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d132..3f57ce4bee5d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
int create)
{
- if (block < 0)
- goto abort_negative;
-
if (!create) {
if (block >= inode->i_blocks)
goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
/* don't support allocation of blocks yet */
return -EIO;
-abort_negative:
- adfs_error(inode->i_sb, "block %d < 0", block);
- return -EIO;
-
abort_toobig:
return 0;
}
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e90..000000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* AFS local cache management interface
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/fscache.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd2..6ece2a13bf71 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
+#include <linux/fscache.h>
#include "afs.h"
#include "afs_vl.h"
-#include "cache.h"
#define AFS_CELL_MAX_ADDRS 15
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d11c51fc2a3f..2ca7a7cafdbf 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
*
*/
+#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/fs.h>
diff --git a/fs/attr.c b/fs/attr.c
index 9fe1b1bd30a8..96d394bdaddf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
/* Taken over from the old code... */
/* POSIX UID/GID verification for setting inode attributes. */
-int inode_change_ok(struct inode *inode, struct iattr *attr)
+int inode_change_ok(const struct inode *inode, struct iattr *attr)
{
int retval = -EPERM;
unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
error:
return retval;
}
-
EXPORT_SYMBOL(inode_change_ok);
+/**
+ * inode_newsize_ok - may this inode be truncated to a given size
+ * @inode: the inode to be truncated
+ * @offset: the new size to assign to the inode
+ * @Returns: 0 on success, -ve errno on failure
+ *
+ * inode_newsize_ok will check filesystem limits and ulimits to check that the
+ * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
+ * when necessary. Caller must not proceed with inode size change if failure is
+ * returned. @inode must be a file (not directory), with appropriate
+ * permissions to allow truncate (inode_newsize_ok does NOT check these
+ * conditions).
+ *
+ * inode_newsize_ok must be called with i_mutex held.
+ */
+int inode_newsize_ok(const struct inode *inode, loff_t offset)
+{
+ if (inode->i_size < offset) {
+ unsigned long limit;
+
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (limit != RLIM_INFINITY && offset > limit)
+ goto out_sig;
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_big;
+ } else {
+ /*
+ * truncation of in-use swapfiles is disallowed - it would
+ * cause subsequent swapout to scribble on the now-freed
+ * blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+ }
+
+ return 0;
+out_sig:
+ send_sig(SIGXFSZ, current, 0);
+out_big:
+ return -EFBIG;
+}
+EXPORT_SYMBOL(inode_newsize_ok);
+
int inode_setattr(struct inode * inode, struct iattr * attr)
{
unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dd376c124e71..33baf27fac78 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb)
{
kfree(BEFS_SB(sb)->mount_opts.iocharset);
BEFS_SB(sb)->mount_opts.iocharset = NULL;
-
- if (BEFS_SB(sb)->nls) {
- unload_nls(BEFS_SB(sb)->nls);
- BEFS_SB(sb)->nls = NULL;
- }
-
+ unload_nls(BEFS_SB(sb)->nls);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 442d94fe255c..b9b3bb51b1e4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1711,42 +1711,52 @@ struct elf_note_info {
int numnote;
};
-static int fill_note_info(struct elfhdr *elf, int phdrs,
- struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+static int elf_note_info_init(struct elf_note_info *info)
{
-#define NUM_NOTES 6
- struct list_head *t;
-
- info->notes = NULL;
- info->prstatus = NULL;
- info->psinfo = NULL;
- info->fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
- info->xfpu = NULL;
-#endif
+ memset(info, 0, sizeof(*info));
INIT_LIST_HEAD(&info->thread_list);
- info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
- GFP_KERNEL);
+ /* Allocate space for six ELF notes */
+ info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
if (!info->notes)
return 0;
info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
if (!info->psinfo)
- return 0;
+ goto notes_free;
info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
if (!info->prstatus)
- return 0;
+ goto psinfo_free;
info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
if (!info->fpu)
- return 0;
+ goto prstatus_free;
#ifdef ELF_CORE_COPY_XFPREGS
info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
if (!info->xfpu)
- return 0;
+ goto fpu_free;
+#endif
+ return 1;
+#ifdef ELF_CORE_COPY_XFPREGS
+ fpu_free:
+ kfree(info->fpu);
#endif
+ prstatus_free:
+ kfree(info->prstatus);
+ psinfo_free:
+ kfree(info->psinfo);
+ notes_free:
+ kfree(info->notes);
+ return 0;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+ struct elf_note_info *info,
+ long signr, struct pt_regs *regs)
+{
+ struct list_head *t;
+
+ if (!elf_note_info_init(info))
+ return 0;
- info->thread_status_size = 0;
if (signr) {
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -1806,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
#endif
return 1;
-
-#undef NUM_NOTES
}
static size_t get_note_info_size(struct elf_note_info *info)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 76285471073e..38502c67987c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
}
stack_size = exec_params.stack_size;
- if (stack_size < interp_params.stack_size)
- stack_size = interp_params.stack_size;
-
if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
executable_stack = EXSTACK_ENABLE_X;
else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
executable_stack = EXSTACK_DISABLE_X;
- else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
- executable_stack = EXSTACK_ENABLE_X;
- else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
- executable_stack = EXSTACK_DISABLE_X;
else
executable_stack = EXSTACK_DEFAULT;
+ if (stack_size == 0) {
+ stack_size = interp_params.stack_size;
+ if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+ executable_stack = EXSTACK_ENABLE_X;
+ else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+ executable_stack = EXSTACK_DISABLE_X;
+ else
+ executable_stack = EXSTACK_DEFAULT;
+ }
+
retval = -ENOEXEC;
if (stack_size == 0)
goto error;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6e..a2796651e756 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
if (ret <= 0)
break;
- if (ret >= (unsigned long) -4096)
- break;
len -= ret;
strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
"(%d != %d)", (unsigned) r, curid, id);
goto failed;
} else if ( ! p->lib_list[id].loaded &&
- load_flat_shared_library(id, p) > (unsigned long) -4096) {
+ IS_ERR_VALUE(load_flat_shared_library(id, p))) {
printk("BINFMT_FLAT: failed to load library %d", id);
goto failed;
}
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_EXECUTABLE, 0);
up_write(&current->mm->mmap_sem);
- if (!textpos || textpos >= (unsigned long) -4096) {
+ if (!textpos || IS_ERR_VALUE(textpos)) {
if (!textpos)
textpos = (unsigned long) -ENOMEM;
printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
up_write(&current->mm->mmap_sem);
- if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
+ if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
if (!realdatastart)
realdatastart = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
result = bprm->file->f_op->read(bprm->file, (char *) datapos,
data_len + (relocs * sizeof(unsigned long)), &fpos);
}
- if (result >= (unsigned long)-4096) {
+ if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
do_munmap(current->mm, textpos, text_len);
do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
up_write(&current->mm->mmap_sem);
- if (!textpos || textpos >= (unsigned long) -4096) {
+ if (!textpos || IS_ERR_VALUE(textpos)) {
if (!textpos)
textpos = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
fpos = 0;
result = bprm->file->f_op->read(bprm->file,
(char *) textpos, text_len, &fpos);
- if (result < (unsigned long) -4096)
+ if (!IS_ERR_VALUE(result))
result = decompress_exec(bprm, text_len, (char *) datapos,
data_len + (relocs * sizeof(unsigned long)), 0);
}
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
fpos = 0;
result = bprm->file->f_op->read(bprm->file,
(char *) textpos, text_len, &fpos);
- if (result < (unsigned long) -4096) {
+ if (!IS_ERR_VALUE(result)) {
fpos = ntohl(hdr->data_start);
result = bprm->file->f_op->read(bprm->file, (char *) datapos,
data_len + (relocs * sizeof(unsigned long)), &fpos);
}
}
- if (result >= (unsigned long)-4096) {
+ if (IS_ERR_VALUE(result)) {
printk("Unable to read code+data+bss, errno %d\n",(int)-result);
do_munmap(current->mm, textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
res = prepare_binprm(&bprm);
- if (res <= (unsigned long)-4096)
+ if (!IS_ERR_VALUE(res))
res = load_flat_file(&bprm, libs, id, NULL);
abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */
res = load_flat_file(bprm, &libinfo, 0, &stack_len);
- if (res > (unsigned long)-4096)
+ if (IS_ERR_VALUE(res))
return res;
/* Update data segment pointers for all libraries */
diff --git a/fs/bio.c b/fs/bio.c
index 76738005c8e8..402cb84a92a1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
mempool_free(p, bs->bio_pool);
}
+EXPORT_SYMBOL(bio_free);
void bio_init(struct bio *bio)
{
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
bio->bi_comp_cpu = -1;
atomic_set(&bio->bi_cnt, 1);
}
+EXPORT_SYMBOL(bio_init);
/**
* bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
mempool_free(p, bs->bio_pool);
return NULL;
}
+EXPORT_SYMBOL(bio_alloc_bioset);
static void bio_fs_destructor(struct bio *bio)
{
@@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
return bio;
}
+EXPORT_SYMBOL(bio_alloc);
static void bio_kmalloc_destructor(struct bio *bio)
{
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
return bio;
}
+EXPORT_SYMBOL(bio_kmalloc);
void zero_fill_bio(struct bio *bio)
{
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
bio->bi_destructor(bio);
}
}
+EXPORT_SYMBOL(bio_put);
inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
return bio->bi_phys_segments;
}
+EXPORT_SYMBOL(bio_phys_segments);
/**
* __bio_clone - clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
bio->bi_size = bio_src->bi_size;
bio->bi_idx = bio_src->bi_idx;
}
+EXPORT_SYMBOL(__bio_clone);
/**
* bio_clone - clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
return b;
}
+EXPORT_SYMBOL(bio_clone);
/**
* bio_get_nr_vecs - return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
return nr_pages;
}
+EXPORT_SYMBOL(bio_get_nr_vecs);
static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
*page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
return __bio_add_page(q, bio, page, len, offset,
queue_max_hw_sectors(q));
}
+EXPORT_SYMBOL(bio_add_pc_page);
/**
* bio_add_page - attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
}
+EXPORT_SYMBOL(bio_add_page);
struct bio_map_data {
struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
bio_put(bio);
return ret;
}
+EXPORT_SYMBOL(bio_uncopy_user);
/**
* bio_copy_user_iov - copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
}
+EXPORT_SYMBOL(bio_copy_user);
static struct bio *__bio_map_user_iov(struct request_queue *q,
struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
}
+EXPORT_SYMBOL(bio_map_user);
/**
* bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
__bio_unmap_user(bio);
bio_put(bio);
}
+EXPORT_SYMBOL(bio_unmap_user);
static void bio_map_kern_endio(struct bio *bio, int err)
{
bio_put(bio);
}
-
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
bio_put(bio);
return ERR_PTR(-EINVAL);
}
+EXPORT_SYMBOL(bio_map_kern);
static void bio_copy_kern_endio(struct bio *bio, int err)
{
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
return bio;
}
+EXPORT_SYMBOL(bio_copy_kern);
/*
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
if (bio->bi_end_io)
bio->bi_end_io(bio, error);
}
+EXPORT_SYMBOL(bio_endio);
void bio_pair_release(struct bio_pair *bp)
{
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
mempool_free(bp, bp->bio2.bi_private);
}
}
+EXPORT_SYMBOL(bio_pair_release);
static void bio_pair_end_1(struct bio *bi, int err)
{
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
return bp;
}
+EXPORT_SYMBOL(bio_split);
/**
* bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
kfree(bs);
}
+EXPORT_SYMBOL(bioset_free);
/**
* bioset_create - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
bioset_free(bs);
return NULL;
}
+EXPORT_SYMBOL(bioset_create);
static void __init biovec_init_slabs(void)
{
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
return 0;
}
-
subsys_initcall(init_bio);
-
-EXPORT_SYMBOL(bio_alloc);
-EXPORT_SYMBOL(bio_kmalloc);
-EXPORT_SYMBOL(bio_put);
-EXPORT_SYMBOL(bio_free);
-EXPORT_SYMBOL(bio_endio);
-EXPORT_SYMBOL(bio_init);
-EXPORT_SYMBOL(__bio_clone);
-EXPORT_SYMBOL(bio_clone);
-EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_add_page);
-EXPORT_SYMBOL(bio_add_pc_page);
-EXPORT_SYMBOL(bio_get_nr_vecs);
-EXPORT_SYMBOL(bio_map_user);
-EXPORT_SYMBOL(bio_unmap_user);
-EXPORT_SYMBOL(bio_map_kern);
-EXPORT_SYMBOL(bio_copy_kern);
-EXPORT_SYMBOL(bio_pair_release);
-EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_copy_user);
-EXPORT_SYMBOL(bio_uncopy_user);
-EXPORT_SYMBOL(bioset_create);
-EXPORT_SYMBOL(bioset_free);
-EXPORT_SYMBOL(bio_alloc_bioset);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5d1ed50bd46c..9cf4b926f8e4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
* freeze_bdev -- lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev)
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- bdev->bd_fsfreeze_count++;
+ if (++bdev->bd_fsfreeze_count > 1) {
+ /*
+ * We don't even need to grab a reference - the first call
+ * to freeze_bdev grab an active reference and only the last
+ * thaw_bdev drops it.
+ */
sb = get_super(bdev);
+ drop_super(sb);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb;
}
- bdev->bd_fsfreeze_count++;
-
- down(&bdev->bd_mount_sem);
- sb = get_super(bdev);
- if (sb && !(sb->s_flags & MS_RDONLY)) {
- sb->s_frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
- sync_filesystem(sb);
-
- sb->s_frozen = SB_FREEZE_TRANS;
- smp_wmb();
-
- sync_blockdev(sb->s_bdev);
-
- if (sb->s_op->freeze_fs) {
- error = sb->s_op->freeze_fs(sb);
- if (error) {
- printk(KERN_ERR
- "VFS:Filesystem freeze failed\n");
- sb->s_frozen = SB_UNFROZEN;
- drop_super(sb);
- up(&bdev->bd_mount_sem);
- bdev->bd_fsfreeze_count--;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return ERR_PTR(error);
- }
+
+ sb = get_active_super(bdev);
+ if (!sb)
+ goto out;
+ if (sb->s_flags & MS_RDONLY) {
+ deactivate_locked_super(sb);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return sb;
+ }
+
+ sb->s_frozen = SB_FREEZE_WRITE;
+ smp_wmb();
+
+ sync_filesystem(sb);
+
+ sb->s_frozen = SB_FREEZE_TRANS;
+ smp_wmb();
+
+ sync_blockdev(sb->s_bdev);
+
+ if (sb->s_op->freeze_fs) {
+ error = sb->s_op->freeze_fs(sb);
+ if (error) {
+ printk(KERN_ERR
+ "VFS:Filesystem freeze failed\n");
+ sb->s_frozen = SB_UNFROZEN;
+ deactivate_locked_super(sb);
+ bdev->bd_fsfreeze_count--;
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return ERR_PTR(error);
}
}
+ up_write(&sb->s_umount);
+ out:
sync_blockdev(bdev);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
- return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
+ return sb; /* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);
@@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
*/
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
- int error = 0;
+ int error = -EINVAL;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (!bdev->bd_fsfreeze_count) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return -EINVAL;
- }
-
- bdev->bd_fsfreeze_count--;
- if (bdev->bd_fsfreeze_count > 0) {
- if (sb)
- drop_super(sb);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return 0;
- }
-
- if (sb) {
- BUG_ON(sb->s_bdev != bdev);
- if (!(sb->s_flags & MS_RDONLY)) {
- if (sb->s_op->unfreeze_fs) {
- error = sb->s_op->unfreeze_fs(sb);
- if (error) {
- printk(KERN_ERR
- "VFS:Filesystem thaw failed\n");
- sb->s_frozen = SB_FREEZE_TRANS;
- bdev->bd_fsfreeze_count++;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
- }
- }
- sb->s_frozen = SB_UNFROZEN;
- smp_wmb();
- wake_up(&sb->s_wait_unfrozen);
+ if (!bdev->bd_fsfreeze_count)
+ goto out_unlock;
+
+ error = 0;
+ if (--bdev->bd_fsfreeze_count > 0)
+ goto out_unlock;
+
+ if (!sb)
+ goto out_unlock;
+
+ BUG_ON(sb->s_bdev != bdev);
+ down_write(&sb->s_umount);
+ if (sb->s_flags & MS_RDONLY)
+ goto out_deactivate;
+
+ if (sb->s_op->unfreeze_fs) {
+ error = sb->s_op->unfreeze_fs(sb);
+ if (error) {
+ printk(KERN_ERR
+ "VFS:Filesystem thaw failed\n");
+ sb->s_frozen = SB_FREEZE_TRANS;
+ bdev->bd_fsfreeze_count++;
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return error;
}
- drop_super(sb);
}
- up(&bdev->bd_mount_sem);
+ sb->s_frozen = SB_UNFROZEN;
+ smp_wmb();
+ wake_up(&sb->s_wait_unfrozen);
+
+out_deactivate:
+ if (sb)
+ deactivate_locked_super(sb);
+out_unlock:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return 0;
}
@@ -430,7 +437,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- sema_init(&bdev->bd_mount_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..361604244271 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
#include "btrfs_inode.h"
#include "xattr.h"
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
.set = btrfs_xattr_acl_access_set,
};
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
int btrfs_acl_chmod(struct inode *inode)
{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
return 0;
}
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449ab..c0861e781cdb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
/* number of things on the pending list */
atomic_t num_pending;
+ /* reference counter for this struct */
+ atomic_t refs;
+
unsigned long sequence;
/* protects the pending list. */
@@ -61,6 +64,51 @@ struct btrfs_worker_thread {
};
/*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time. It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations. This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+ struct btrfs_work work;
+ struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+ struct worker_start *start;
+ start = container_of(work, struct worker_start, work);
+ btrfs_start_workers(start->queue, 1);
+ kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+ struct worker_start *start;
+ int ret;
+
+ start = kzalloc(sizeof(*start), GFP_NOFS);
+ if (!start)
+ return -ENOMEM;
+
+ start->work.func = start_new_worker_func;
+ start->queue = queue;
+ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+ if (ret)
+ kfree(start);
+ return ret;
+}
+
+/*
* helper function to move a thread onto the idle list after it
* has finished some requests.
*/
@@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 1;
- list_move(&worker->worker_list, &worker->workers->idle_list);
+
+ /* the list may be empty if the worker is just starting */
+ if (!list_empty(&worker->worker_list)) {
+ list_move(&worker->worker_list,
+ &worker->workers->idle_list);
+ }
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
@@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
+
+ if (!list_empty(&worker->worker_list)) {
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ }
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
+ struct btrfs_workers *workers = worker->workers;
unsigned long flags;
+ rmb();
+ if (!workers->atomic_start_pending)
+ return;
+
+ spin_lock_irqsave(&workers->lock, flags);
+ if (!workers->atomic_start_pending)
+ goto out;
+
+ workers->atomic_start_pending = 0;
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers)
+ goto out;
+
+ workers->num_workers_starting += 1;
+ spin_unlock_irqrestore(&workers->lock, flags);
+ start_new_worker(workers);
+ return;
+
+out:
+ spin_unlock_irqrestore(&workers->lock, flags);
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+ struct btrfs_work *work)
+{
if (!workers->ordered)
return 0;
set_bit(WORK_DONE_BIT, &work->flags);
- spin_lock_irqsave(&workers->lock, flags);
+ spin_lock(&workers->order_lock);
while (1) {
if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
work->ordered_func(work);
/* now take the lock again and call the freeing code */
- spin_lock_irqsave(&workers->lock, flags);
+ spin_lock(&workers->order_lock);
list_del(&work->order_list);
work->ordered_free(work);
}
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
return 0;
}
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+ if (atomic_dec_and_test(&worker->refs))
+ kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+ int freeit = 0;
+
+ spin_lock_irq(&worker->lock);
+ spin_lock(&worker->workers->lock);
+ if (worker->workers->num_workers > 1 &&
+ worker->idle &&
+ !worker->working &&
+ !list_empty(&worker->worker_list) &&
+ list_empty(&worker->prio_pending) &&
+ list_empty(&worker->pending) &&
+ atomic_read(&worker->num_pending) == 0) {
+ freeit = 1;
+ list_del_init(&worker->worker_list);
+ worker->workers->num_workers--;
+ }
+ spin_unlock(&worker->workers->lock);
+ spin_unlock_irq(&worker->lock);
+
+ if (freeit)
+ put_worker(worker);
+ return freeit;
+}
+
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+ struct list_head *prio_head,
+ struct list_head *head)
+{
+ struct btrfs_work *work = NULL;
+ struct list_head *cur = NULL;
+
+ if(!list_empty(prio_head))
+ cur = prio_head->next;
+
+ smp_mb();
+ if (!list_empty(&worker->prio_pending))
+ goto refill;
+
+ if (!list_empty(head))
+ cur = head->next;
+
+ if (cur)
+ goto out;
+
+refill:
+ spin_lock_irq(&worker->lock);
+ list_splice_tail_init(&worker->prio_pending, prio_head);
+ list_splice_tail_init(&worker->pending, head);
+
+ if (!list_empty(prio_head))
+ cur = prio_head->next;
+ else if (!list_empty(head))
+ cur = head->next;
+ spin_unlock_irq(&worker->lock);
+
+ if (!cur)
+ goto out_fail;
+
+out:
+ work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+ return work;
+}
+
/*
* main loop for servicing work items
*/
static int worker_loop(void *arg)
{
struct btrfs_worker_thread *worker = arg;
- struct list_head *cur;
+ struct list_head head;
+ struct list_head prio_head;
struct btrfs_work *work;
+
+ INIT_LIST_HEAD(&head);
+ INIT_LIST_HEAD(&prio_head);
+
do {
- spin_lock_irq(&worker->lock);
-again_locked:
+again:
while (1) {
- if (!list_empty(&worker->prio_pending))
- cur = worker->prio_pending.next;
- else if (!list_empty(&worker->pending))
- cur = worker->pending.next;
- else
+
+
+ work = get_next_work(worker, &prio_head, &head);
+ if (!work)
break;
- work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags);
work->worker = worker;
- spin_unlock_irq(&worker->lock);
work->func(work);
@@ -175,9 +329,13 @@ again_locked:
*/
run_ordered_completions(worker->workers, work);
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
+ check_pending_worker_creates(worker);
+
}
+
+ spin_lock_irq(&worker->lock);
+ check_idle_worker(worker);
+
if (freezing(current)) {
worker->working = 0;
spin_unlock_irq(&worker->lock);
@@ -216,8 +374,10 @@ again_locked:
spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- goto again_locked;
+ !list_empty(&worker->prio_pending)) {
+ spin_unlock_irq(&worker->lock);
+ goto again;
+ }
/*
* this makes sure we get a wakeup when someone
@@ -226,8 +386,13 @@ again_locked:
worker->working = 0;
spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop())
- schedule();
+ if (!kthread_should_stop()) {
+ schedule_timeout(HZ * 120);
+ if (!worker->working &&
+ try_worker_shutdown(worker)) {
+ return 0;
+ }
+ }
}
__set_current_state(TASK_RUNNING);
}
@@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
+ int can_stop;
+ spin_lock_irq(&workers->lock);
list_splice_init(&workers->idle_list, &workers->worker_list);
while (!list_empty(&workers->worker_list)) {
cur = workers->worker_list.next;
worker = list_entry(cur, struct btrfs_worker_thread,
worker_list);
- kthread_stop(worker->task);
- list_del(&worker->worker_list);
- kfree(worker);
+
+ atomic_inc(&worker->refs);
+ workers->num_workers -= 1;
+ if (!list_empty(&worker->worker_list)) {
+ list_del_init(&worker->worker_list);
+ put_worker(worker);
+ can_stop = 1;
+ } else
+ can_stop = 0;
+ spin_unlock_irq(&workers->lock);
+ if (can_stop)
+ kthread_stop(worker->task);
+ spin_lock_irq(&workers->lock);
+ put_worker(worker);
}
+ spin_unlock_irq(&workers->lock);
return 0;
}
/*
* simple init on struct btrfs_workers
*/
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_helper)
{
workers->num_workers = 0;
+ workers->num_workers_starting = 0;
INIT_LIST_HEAD(&workers->worker_list);
INIT_LIST_HEAD(&workers->idle_list);
INIT_LIST_HEAD(&workers->order_list);
INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock);
+ spin_lock_init(&workers->order_lock);
workers->max_workers = max;
workers->idle_thresh = 32;
workers->name = name;
workers->ordered = 0;
+ workers->atomic_start_pending = 0;
+ workers->atomic_worker_start = async_helper;
}
/*
* starts new worker threads. This does not enforce the max worker
* count in case you need to temporarily go past it.
*/
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+ int num_workers)
{
struct btrfs_worker_thread *worker;
int ret = 0;
@@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
+
atomic_set(&worker->num_pending, 0);
+ atomic_set(&worker->refs, 1);
worker->workers = workers;
worker->task = kthread_run(worker_loop, worker,
"btrfs-%s-%d", workers->name,
@@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
kfree(worker);
goto fail;
}
-
spin_lock_irq(&workers->lock);
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
workers->num_workers++;
+ workers->num_workers_starting--;
+ WARN_ON(workers->num_workers_starting < 0);
spin_unlock_irq(&workers->lock);
}
return 0;
@@ -316,6 +504,14 @@ fail:
return ret;
}
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+ spin_lock_irq(&workers->lock);
+ workers->num_workers_starting += num_workers;
+ spin_unlock_irq(&workers->lock);
+ return __btrfs_start_workers(workers, num_workers);
+}
+
/*
* run through the list and find a worker thread that doesn't have a lot
* to do right now. This can return null if we aren't yet at the thread
@@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
struct list_head *next;
- int enforce_min = workers->num_workers < workers->max_workers;
+ int enforce_min;
+
+ enforce_min = (workers->num_workers + workers->num_workers_starting) <
+ workers->max_workers;
/*
* if we find an idle thread, don't move it to the end of the
@@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
*/
next = workers->worker_list.next;
worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- atomic_inc(&worker->num_pending);
worker->sequence++;
if (worker->sequence % workers->idle_thresh == 0)
@@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
+ struct list_head *fallback;
again:
spin_lock_irqsave(&workers->lock, flags);
worker = next_worker(workers);
- spin_unlock_irqrestore(&workers->lock, flags);
if (!worker) {
- spin_lock_irqsave(&workers->lock, flags);
- if (workers->num_workers >= workers->max_workers) {
- struct list_head *fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the force one
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
- spin_unlock_irqrestore(&workers->lock, flags);
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers) {
+ goto fallback;
+ } else if (workers->atomic_worker_start) {
+ workers->atomic_start_pending = 1;
+ goto fallback;
} else {
+ workers->num_workers_starting++;
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
- btrfs_start_workers(workers, 1);
+ __btrfs_start_workers(workers, 1);
goto again;
}
}
+ goto found;
+
+fallback:
+ fallback = NULL;
+ /*
+ * we have failed to find any workers, just
+ * return the first one we can find.
+ */
+ if (!list_empty(&workers->worker_list))
+ fallback = workers->worker_list.next;
+ if (!list_empty(&workers->idle_list))
+ fallback = workers->idle_list.next;
+ BUG_ON(!fallback);
+ worker = list_entry(fallback,
+ struct btrfs_worker_thread, worker_list);
+found:
+ /*
+ * this makes sure the worker doesn't exit before it is placed
+ * onto a busy/idle list
+ */
+ atomic_inc(&worker->num_pending);
+ spin_unlock_irqrestore(&workers->lock, flags);
return worker;
}
@@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
spin_lock(&worker->workers->lock);
worker->idle = 0;
list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
+ &worker->workers->worker_list);
spin_unlock(&worker->workers->lock);
}
if (!worker->working) {
@@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
worker->working = 1;
}
- spin_unlock_irqrestore(&worker->lock, flags);
if (wake)
wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
out:
return 0;
@@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers);
if (workers->ordered) {
- spin_lock_irqsave(&workers->lock, flags);
+ /*
+ * you're not allowed to do ordered queues from an
+ * interrupt handler
+ */
+ spin_lock(&workers->order_lock);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
list_add_tail(&work->order_list,
&workers->prio_order_list);
} else {
list_add_tail(&work->order_list, &workers->order_list);
}
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
} else {
INIT_LIST_HEAD(&work->order_list);
}
@@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
check_busy_worker(worker);
/*
@@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
wake = 1;
worker->working = 1;
- spin_unlock_irqrestore(&worker->lock, flags);
-
if (wake)
wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
+
out:
return 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db6..5077746cf85e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
/* current number of running workers */
int num_workers;
+ int num_workers_starting;
+
/* max number of workers allowed. changed by btrfs_start_workers */
int max_workers;
@@ -73,6 +75,16 @@ struct btrfs_workers {
/* force completions in the order they were queued */
int ordered;
+ /* more workers required, but in an interrupt handler */
+ int atomic_start_pending;
+
+ /*
+ * are we allowed to sleep while starting workers or are we required
+ * to start them at a later time? If we can't sleep, this indicates
+ * which queue we need to use to schedule thread creation.
+ */
+ struct btrfs_workers *atomic_worker_start;
+
/* list with all the work threads. The workers on the idle thread
* may be actively servicing jobs, but they haven't yet hit the
* idle thresh limit above.
@@ -90,6 +102,9 @@ struct btrfs_workers {
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
+ /* lock for the ordered lists */
+ spinlock_t order_lock;
+
/* extra name for this worker, used for current->name */
char *name;
};
@@ -97,7 +112,8 @@ struct btrfs_workers {
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_starter);
int btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0e..f6783a42f010 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
* transid of the trans_handle that last modified this inode
*/
u64 last_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ u64 last_sub_trans;
+
/*
* transid that last logged this inode
*/
@@ -128,6 +134,16 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
+ */
+ spinlock_t accounting_lock;
+ int reserved_extents;
+ int outstanding_extents;
+
+ /*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -138,6 +154,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
+ unsigned dummy_inode:1;
struct inode vfs_inode;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a37..a11a32058b50 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc0512d3a..ec96f3a6d536 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int split;
int num_doubles = 0;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+ return -EOVERFLOW;
+
/* first try to make some room by pushing left and right */
if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
wret = push_leaf_right(trans, root, path, data_size, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84ca..444b3e9b92a4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
@@ -670,21 +674,29 @@ struct btrfs_space_info {
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
-
- /* delalloc accounting */
- u64 bytes_delalloc; /* number of bytes reserved for allocation,
- this space is not necessarily reserved yet
- by the allocator */
+ u64 bytes_super; /* total bytes reserved for the super blocks */
+ u64 bytes_root; /* the number of bytes needed to commit a
+ transaction */
u64 bytes_may_use; /* number of bytes that may be used for
- delalloc */
+ delalloc/allocations */
+ u64 bytes_delalloc; /* number of bytes currently reserved for
+ delayed allocation */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
+ int force_delalloc; /* make people start doing filemap_flush until
+ we're under a threshold */
struct list_head list;
+ /* for controlling how we free up space for allocations */
+ wait_queue_head_t allocate_wait;
+ wait_queue_head_t flush_wait;
+ int allocating_chunk;
+ int flushing;
+
/* for block groups in our same type */
struct list_head block_groups;
spinlock_t lock;
@@ -726,6 +738,15 @@ enum btrfs_caching_type {
BTRFS_CACHE_FINISHED = 2,
};
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_block_group_cache *block_group;
+ u64 progress;
+ atomic_t count;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -733,6 +754,7 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 bytes_super;
u64 flags;
u64 sectorsize;
int extents_thresh;
@@ -742,8 +764,9 @@ struct btrfs_block_group_cache {
int dirty;
/* cache tracking stuff */
- wait_queue_head_t caching_q;
int cached;
+ struct btrfs_caching_control *caching_ctl;
+ u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -782,13 +805,16 @@ struct btrfs_fs_info {
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
- struct extent_io_tree pinned_extents;
+ struct extent_io_tree freed_extents[2];
+ struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +848,7 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
- struct mutex drop_mutex;
struct mutex volume_mutex;
- struct mutex tree_reloc_mutex;
- struct rw_semaphore extent_commit_sem;
-
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -835,10 +857,16 @@ struct btrfs_fs_info {
* before jumping into the main commit.
*/
struct mutex ordered_operations_mutex;
+ struct rw_semaphore extent_commit_sem;
+
+ struct rw_semaphore subvol_sem;
+
+ struct srcu_struct subvol_srcu;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
+ struct list_head caching_block_groups;
atomic_t nr_async_submits;
atomic_t async_submit_draining;
@@ -882,6 +910,7 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
+ struct btrfs_workers generic_worker;
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
struct btrfs_workers endio_workers;
@@ -889,6 +918,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
+ struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -979,7 +1009,10 @@ struct btrfs_root {
atomic_t log_writers;
atomic_t log_commit[2];
unsigned long log_transid;
+ unsigned long last_log_commit;
unsigned long log_batch;
+ pid_t log_start_pid;
+ bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -996,10 +1029,12 @@ struct btrfs_root {
u32 stripesize;
u32 type;
- u64 highest_inode;
- u64 last_inode_alloc;
+
+ u64 highest_objectid;
int ref_cows;
int track_dirty;
+ int in_radix;
+
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1118,6 +1153,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
+#define BTRFS_MOUNT_DISCARD (1 << 10)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_pinned_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int pin);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_io_tree *unpin);
+ struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent);
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id);
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id,
- u64 dirid, u64 sequence,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len);
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key);
@@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
int btrfs_search_root(struct btrfs_root *root, u64 search_start,
u64 *found_objectid);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
int btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
/* dir-item.c */
@@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
u64 objectid, const char *name, int name_len,
int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len);
@@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* inode-map.c */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
@@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, struct dentry *dentry,
+ struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
void btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2286,11 +2344,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_block);
+ u64 inline_limit, u64 *hint_block, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
@@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
/* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
int btrfs_check_acl(struct inode *inode, int mask);
#else
#define btrfs_check_acl NULL
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00c..f3a6075519cc 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return di;
+
+ path->slots[0]++;
+ }
+ return NULL;
+}
+
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c4173146bb7..02b6afbd7450 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
struct extent_map *em;
int ret;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
em->bdev =
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
goto out;
}
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
free_extent_map(em);
em = NULL;
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret)
em = ERR_PTR(ret);
@@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
- buf->start + buf->len - 1, WB_SYNC_ALL);
+ return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
- buf->start, buf->start + buf->len - 1);
+ return filemap_fdatawait_range(buf->first_page->mapping,
+ buf->start, buf->start + buf->len - 1);
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
- root->highest_inode = 0;
- root->last_inode_alloc = 0;
+ root->highest_objectid = 0;
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree.rb_node = NULL;
@@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_writers, 0);
root->log_batch = 0;
root->log_transid = 0;
+ root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -952,14 +953,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
root, fs_info, objectid);
ret = btrfs_find_last_root(tree_root, objectid,
&root->root_item, &root->root_key);
+ if (ret > 0)
+ return -ENOENT;
BUG_ON(ret);
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
+ root->commit_root = btrfs_root_node(root);
return 0;
}
@@ -1085,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->last_log_commit = 0;
return 0;
}
@@ -1095,7 +1099,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
- u64 highest_inode;
u64 generation;
u32 blocksize;
int ret = 0;
@@ -1110,7 +1113,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
kfree(root);
return ERR_PTR(ret);
}
- goto insert;
+ goto out;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1123,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
path = btrfs_alloc_path();
BUG_ON(!path);
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret != 0) {
- if (ret > 0)
- ret = -ENOENT;
- goto out;
+ if (ret == 0) {
+ l = path->nodes[0];
+ read_extent_buffer(l, &root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(root->root_item));
+ memcpy(&root->root_key, location, sizeof(*location));
}
- l = path->nodes[0];
- read_extent_buffer(l, &root->root_item,
- btrfs_item_ptr_offset(l, path->slots[0]),
- sizeof(root->root_item));
- memcpy(&root->root_key, location, sizeof(*location));
- ret = 0;
-out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
if (ret) {
- kfree(root);
+ if (ret > 0)
+ ret = -ENOENT;
return ERR_PTR(ret);
}
+
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
-insert:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+out:
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
root->ref_cows = 1;
- ret = btrfs_find_highest_inode(root, &highest_inode);
- if (ret == 0) {
- root->highest_inode = highest_inode;
- root->last_inode_alloc = highest_inode;
- }
- }
+
return root;
}
@@ -1187,39 +1181,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
-
+again:
+ spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)location->objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (root)
return root;
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret == 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ERR_PTR(ret);
+
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
+ WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto fail;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
+ if (ret == 0)
+ root->in_radix = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
if (ret) {
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
+ if (ret == -EEXIST) {
+ free_fs_root(root);
+ goto again;
+ }
+ goto fail;
}
- if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- BUG_ON(ret);
+
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root->root_key.objectid);
+ WARN_ON(ret);
+
+ if (!(fs_info->sb->s_flags & MS_RDONLY))
btrfs_orphan_cleanup(root);
- }
+
return root;
+fail:
+ free_fs_root(root);
+ return ERR_PTR(ret);
}
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
const char *name, int namelen)
{
+ return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
struct btrfs_root *root;
int ret;
@@ -1236,7 +1257,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
-#if 0
+
ret = btrfs_sysfs_add_root(root);
if (ret) {
free_extent_buffer(root->node);
@@ -1244,9 +1265,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
-#endif
root->in_sysfs = 1;
return root;
+#endif
}
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1346,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
offset = page_offset(page);
em_tree = &BTRFS_I(inode)->extent_tree;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em) {
__unplug_io_fn(bdi, page);
return;
@@ -1360,8 +1381,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
err = bdi_register(bdi, NULL, "btrfs-%d",
atomic_inc_return(&btrfs_bdi_num));
- if (err)
+ if (err) {
+ bdi_destroy(bdi);
return err;
+ }
bdi->ra_pages = default_backing_dev_info.ra_pages;
bdi->unplug_io_fn = btrfs_unplug_io_fn;
@@ -1451,9 +1474,12 @@ static int cleaner_kthread(void *arg)
break;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+ mutex_trylock(&root->fs_info->cleaner_mutex)) {
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ }
if (freezing(current)) {
refrigerator();
@@ -1558,15 +1584,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -ENOMEM;
goto fail;
}
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+
+ ret = init_srcu_struct(&fs_info->subvol_srcu);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ ret = setup_bdi(fs_info, &fs_info->bdi);
+ if (ret) {
+ err = ret;
+ goto fail_srcu;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail_bdi;
+ }
+
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
+ INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1585,12 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
- if (setup_bdi(fs_info, &fs_info->bdi))
- goto fail_bdi;
- fs_info->btree_inode = new_inode(sb);
- fs_info->btree_inode->i_ino = 1;
- fs_info->btree_inode->i_nlink = 1;
- fs_info->metadata_ratio = 8;
+ fs_info->metadata_ratio = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1602,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ fs_info->btree_inode->i_nlink = 1;
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -1620,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ insert_inode_hash(fs_info->btree_inode);
+
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree.rb_node = NULL;
- extent_io_tree_init(&fs_info->pinned_extents,
+ extent_io_tree_init(&fs_info->freed_extents[0],
fs_info->btree_inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&fs_info->freed_extents[1],
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
- BTRFS_I(fs_info->btree_inode)->root = tree_root;
- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
- sizeof(struct btrfs_key));
- insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
- mutex_init(&fs_info->drop_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- mutex_init(&fs_info->tree_reloc_mutex);
init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->subvol_sem);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1701,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_iput;
}
- /*
- * we need to start all the end_io workers up front because the
- * queue work function gets called at interrupt time, and so it
- * cannot dynamically grow.
- */
+ btrfs_init_workers(&fs_info->generic_worker,
+ "genwork", 1, NULL);
+
btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size));
+ fs_info->thread_pool_size),
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
@@ -1728,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->delalloc_workers.idle_thresh = 2;
fs_info->delalloc_workers.ordered = 1;
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size);
+ "endio-meta-write", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1745,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_write_workers.idle_thresh = 64;
- fs_info->endio_meta_write_workers.idle_thresh = 64;
+ fs_info->endio_write_workers.idle_thresh = 2;
+ fs_info->endio_meta_write_workers.idle_thresh = 2;
btrfs_start_workers(&fs_info->workers, 1);
+ btrfs_start_workers(&fs_info->generic_worker, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_meta_workers,
- fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_meta_write_workers,
- fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_write_workers,
- fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+ btrfs_start_workers(&fs_info->endio_write_workers, 1);
+ btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1918,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
}
+ ret = btrfs_find_orphan_roots(tree_root);
+ BUG_ON(ret);
+
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_recover_relocation(tree_root);
BUG_ON(ret);
@@ -1961,6 +2020,7 @@ fail_chunk_root:
free_extent_buffer(chunk_root->node);
free_extent_buffer(chunk_root->commit_root);
fail_sb_buffer:
+ btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -1969,6 +2029,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -1977,6 +2038,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
+fail_srcu:
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
kfree(extent_root);
kfree(tree_root);
@@ -2236,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ synchronize_srcu(&fs_info->subvol_srcu);
+
+ free_fs_root(root);
+ return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
if (root->anon_super.s_dev) {
down_write(&root->anon_super.s_umount);
kill_anon_super(&root->anon_super);
}
- if (root->node)
- free_extent_buffer(root->node);
- if (root->commit_root)
- free_extent_buffer(root->commit_root);
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
kfree(root->name);
kfree(root);
- return 0;
}
static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2258,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *gang[8];
int i;
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (gang[0]->in_radix) {
+ btrfs_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ kfree(gang[0]);
+ }
+ }
+
while (1) {
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, 0,
@@ -2287,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
root_objectid = gang[i]->root_key.objectid;
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root_objectid);
- BUG_ON(ret);
btrfs_orphan_cleanup(gang[i]);
}
root_objectid++;
@@ -2359,12 +2442,12 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(root->fs_info->csum_root->commit_root);
btrfs_free_block_groups(root->fs_info);
- btrfs_free_pinned_extents(root->fs_info);
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
+ btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -2373,11 +2456,13 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4e..ba5c3fd5ab8c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
- fid->objectid = BTRFS_I(inode)->location.objectid;
+ fid->objectid = inode->i_ino;
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
}
static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation)
+ u64 root_objectid, u32 generation,
+ int check_generation)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
struct btrfs_root *root;
+ struct dentry *dentry;
struct inode *inode;
struct btrfs_key key;
+ int index;
+ int err = 0;
+
+ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto fail;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ err = -ENOENT;
+ goto fail;
+ }
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
inode = btrfs_iget(sb, &key, root);
- if (IS_ERR(inode))
- return (void *)inode;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
- if (generation != inode->i_generation) {
+ if (check_generation && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = child->d_inode;
+ static struct dentry *dentry;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
- int slot;
- u64 objectid;
+ struct btrfs_root_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
int ret;
path = btrfs_alloc_path();
- key.objectid = dir->i_ino;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
- key.offset = (u64)-1;
+ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = dir->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- /* Error */
- btrfs_free_path(path);
- return ERR_PTR(ret);
+ if (ret < 0)
+ goto fail;
+
+ BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto fail;
}
+
+ path->slots[0]--;
leaf = path->nodes[0];
- slot = path->slots[0];
- if (ret) {
- /* btrfs_search_slot() returns the slot where we'd want to
- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
- The _real_ backref, telling us what the parent inode
- _actually_ is, will be in the slot _before_ the one
- that btrfs_search_slot() returns. */
- if (!slot) {
- /* Unless there is _no_ key in the tree before... */
- btrfs_free_path(path);
- return ERR_PTR(-EIO);
- }
- slot--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != key.objectid || found_key.type != key.type) {
+ ret = -ENOENT;
+ goto fail;
}
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ key.objectid = btrfs_root_ref_dirid(leaf, ref);
+ } else {
+ key.objectid = found_key.offset;
+ }
btrfs_free_path(path);
- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
- return ERR_PTR(-EINVAL);
-
- objectid = key.offset;
-
- /* If we are already at the root of a subvol, return the real root */
- if (objectid == dir->i_ino)
- return dget(dir->i_sb->s_root);
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+ found_key.offset, 0, 0);
+ }
- /* Build a new key for the inode item */
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
-
- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
}
const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 535f85ba104f..e238a0cdac67 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,12 +32,12 @@
#include "locking.h"
#include "free-space-cache.h"
-static int update_reserved_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
-
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
+ struct extent_buffer **must_clean);
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
return ret;
}
-/*
- * We always set EXTENT_LOCKED for the super mirror extents so we don't
- * overwrite them, so those bits need to be unset. Also, if we are unmounting
- * with pinned extents still sitting there because we had a block group caching,
- * we need to clear those now, since we are done.
- */
-void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+static int add_excluded_extent(struct btrfs_root *root,
+ u64 start, u64 num_bytes)
{
- u64 start, end, last = 0;
- int ret;
+ u64 end = start + num_bytes - 1;
+ set_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ set_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ return 0;
+}
- while (1) {
- ret = find_first_extent_bit(&info->pinned_extents, last,
- &start, &end,
- EXTENT_LOCKED|EXTENT_DIRTY);
- if (ret)
- break;
+static void free_excluded_extents(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 start, end;
- clear_extent_bits(&info->pinned_extents, start, end,
- EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
- last = end+1;
- }
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+
+ clear_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ clear_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
}
-static int remove_sb_from_cache(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache)
+static int exclude_super_stripes(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 *logical;
int stripe_len;
@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
BUG_ON(ret);
+
while (nr--) {
- try_lock_extent(&fs_info->pinned_extents,
- logical[nr],
- logical[nr] + stripe_len - 1, GFP_NOFS);
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, logical[nr],
+ stripe_len);
+ BUG_ON(ret);
}
+
kfree(logical);
}
-
return 0;
}
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_STARTED) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (atomic_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
int ret;
while (start < end) {
- ret = find_first_extent_bit(&info->pinned_extents, start,
+ ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY|EXTENT_LOCKED);
+ EXTENT_DIRTY | EXTENT_UPTODATE);
if (ret)
break;
@@ -249,22 +283,27 @@ static int caching_kthread(void *data)
{
struct btrfs_block_group_cache *block_group = data;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 last = 0;
+ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
- int ret = 0;
- struct btrfs_key key;
struct extent_buffer *leaf;
- int slot;
+ struct btrfs_key key;
u64 total_found = 0;
-
- BUG_ON(!fs_info);
+ u64 last = 0;
+ u32 nritems;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- atomic_inc(&block_group->space_info->caching_threads);
+ exclude_super_stripes(extent_root, block_group);
+ spin_lock(&block_group->space_info->lock);
+ block_group->space_info->bytes_super += block_group->bytes_super;
+ spin_unlock(&block_group->space_info->lock);
+
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -277,74 +316,64 @@ static int caching_kthread(void *data)
key.objectid = last;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ key.type = BTRFS_EXTENT_ITEM_KEY;
again:
+ mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
down_read(&fs_info->extent_commit_sem);
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto err;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
while (1) {
smp_mb();
- if (block_group->fs_info->closing > 1) {
+ if (fs_info->closing > 1) {
last = (u64)-1;
break;
}
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(fs_info->extent_root, path);
- if (ret < 0)
- goto err;
- else if (ret)
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = find_next_key(path, 0, &key);
+ if (ret)
break;
- if (need_resched() ||
- btrfs_transaction_in_commit(fs_info)) {
- leaf = path->nodes[0];
-
- /* this shouldn't happen, but if the
- * leaf is empty just move on.
- */
- if (btrfs_header_nritems(leaf) == 0)
- break;
- /*
- * we need to copy the key out so that
- * we are sure the next search advances
- * us forward in the btree.
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(fs_info->extent_root, path);
- up_read(&fs_info->extent_commit_sem);
+ caching_ctl->progress = last;
+ btrfs_release_path(extent_root, path);
+ up_read(&fs_info->extent_commit_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ if (btrfs_transaction_in_commit(fs_info))
schedule_timeout(1);
- goto again;
- }
+ else
+ cond_resched();
+ goto again;
+ }
+ if (key.objectid < block_group->key.objectid) {
+ path->slots[0]++;
continue;
}
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid < block_group->key.objectid)
- goto next;
if (key.objectid >= block_group->key.objectid +
block_group->key.offset)
break;
- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
total_found += add_new_free_space(block_group,
fs_info, last,
key.objectid);
last = key.objectid + key.offset;
- }
- if (total_found > (1024 * 1024 * 2)) {
- total_found = 0;
- wake_up(&block_group->caching_q);
+ if (total_found > (1024 * 1024 * 2)) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
}
-next:
path->slots[0]++;
}
ret = 0;
@@ -352,33 +381,65 @@ next:
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
block_group->cached = BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
err:
btrfs_free_path(path);
up_read(&fs_info->extent_commit_sem);
- atomic_dec(&block_group->space_info->caching_threads);
- wake_up(&block_group->caching_q);
+ free_excluded_extents(extent_root, block_group);
+
+ mutex_unlock(&caching_ctl->mutex);
+ wake_up(&caching_ctl->wait);
+
+ put_caching_control(caching_ctl);
+ atomic_dec(&block_group->space_info->caching_threads);
return 0;
}
static int cache_block_group(struct btrfs_block_group_cache *cache)
{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl;
struct task_struct *tsk;
int ret = 0;
+ smp_mb();
+ if (cache->cached != BTRFS_CACHE_NO)
+ return 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ /* one for caching kthread, one for caching block group list */
+ atomic_set(&caching_ctl->count, 2);
+
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
spin_unlock(&cache->lock);
- return ret;
+ kfree(caching_ctl);
+ return 0;
}
+ cache->caching_ctl = caching_ctl;
cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
+ down_write(&fs_info->extent_commit_sem);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ up_write(&fs_info->extent_commit_sem);
+
+ atomic_inc(&cache->space_info->caching_threads);
+
tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
cache->key.objectid);
if (IS_ERR(tsk)) {
@@ -1507,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-#ifdef BIO_RW_DISCARD
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
DISCARD_FL_BARRIER);
}
-#endif
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
u64 num_bytes)
{
-#ifdef BIO_RW_DISCARD
int ret;
u64 map_length = num_bytes;
struct btrfs_multi_bio *multi = NULL;
+ if (!btrfs_test_opt(root, DISCARD))
+ return 0;
+
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
bytenr, &map_length, &multi, 0);
@@ -1543,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
}
return ret;
-#else
- return 0;
-#endif
}
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -1657,7 +1715,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
parent, ref_root, flags,
ref->objectid, ref->offset,
&ins, node->ref_mod);
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent,
@@ -1783,7 +1840,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
extent_op->flags_to_set,
&extent_op->key,
ref->level, &ins);
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
@@ -1818,16 +1874,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
+ int mark_free = 0;
+ struct extent_buffer *must_clean = NULL;
+
+ ret = pin_down_bytes(trans, root, NULL,
+ node->bytenr, node->num_bytes,
+ head->is_data, 1, &must_clean);
+ if (ret > 0)
+ mark_free = 1;
+
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
- btrfs_update_pinned_extents(root, node->bytenr,
- node->num_bytes, 1);
- update_reserved_extents(root, node->bytenr,
- node->num_bytes, 0);
+ if (mark_free) {
+ ret = btrfs_free_reserved_extent(root,
+ node->bytenr,
+ node->num_bytes);
+ BUG_ON(ret);
+ }
}
mutex_unlock(&head->mutex);
return 0;
@@ -2692,60 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
alloc_target);
}
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+ u64 num_bytes;
+ int level;
+
+ level = BTRFS_MAX_LEVEL - 2;
+ /*
+ * NOTE: these calculations are absolutely the worst possible case.
+ * This assumes that _every_ item we insert will require a new leaf, and
+ * that the tree has grown to its maximum level size.
+ */
+
+ /*
+ * for every item we insert we could insert both an extent item and a
+ * extent ref item. Then for ever item we insert, we will need to cow
+ * both the original leaf, plus the leaf to the left and right of it.
+ *
+ * Unless we are talking about the extent root, then we just want the
+ * number of items * 2, since we just need the extent item plus its ref.
+ */
+ if (root == root->fs_info->extent_root)
+ num_bytes = num_items * 2;
+ else
+ num_bytes = (num_items + (2 * num_items)) * 3;
+
+ /*
+ * num_bytes is total number of leaves we could need times the leaf
+ * size, and then for every leaf we could end up cow'ing 2 nodes per
+ * level, down to the leaf level.
+ */
+ num_bytes = (num_bytes * root->leafsize) +
+ (num_bytes * (level * 2)) * root->nodesize;
+
+ return num_bytes;
+}
+
/*
- * for now this just makes sure we have at least 5% of our metadata space free
- * for use.
+ * Unreserve metadata space for delalloc. If we have less reserved credits than
+ * we have extents, this function does nothing.
*/
-int btrfs_check_metadata_free_space(struct btrfs_root *root)
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
- u64 alloc_target, thresh;
- int committed = 0, ret;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
-again:
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+
spin_lock(&meta_sinfo->lock);
- if (!meta_sinfo->full)
- thresh = meta_sinfo->total_bytes * 80;
- else
- thresh = meta_sinfo->total_bytes * 95;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ if (BTRFS_I(inode)->reserved_extents <=
+ BTRFS_I(inode)->outstanding_extents) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ spin_unlock(&meta_sinfo->lock);
+ return 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ BTRFS_I(inode)->reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
+
+ if (meta_sinfo->bytes_delalloc < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_delalloc = 0;
+ } else {
+ meta_sinfo->bytes_delalloc -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+ u64 thresh;
+
+ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use;
+ thresh = meta_sinfo->total_bytes - thresh;
+ thresh *= 80;
do_div(thresh, 100);
+ if (thresh <= meta_sinfo->bytes_delalloc)
+ meta_sinfo->force_delalloc = 1;
+ else
+ meta_sinfo->force_delalloc = 0;
+}
- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
- struct btrfs_trans_handle *trans;
- if (!meta_sinfo->full) {
- meta_sinfo->force_alloc = 1;
- spin_unlock(&meta_sinfo->lock);
+struct async_flush {
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
+ struct btrfs_work work;
+};
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+ struct async_flush *async;
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, alloc_target, 0);
- btrfs_end_transaction(trans, root);
+ async = container_of(work, struct async_flush, work);
+ root = async->root;
+ info = async->info;
+
+ btrfs_start_delalloc_inodes(root);
+ wake_up(&info->flush_wait);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+
+ kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+ DEFINE_WAIT(wait);
+ u64 used;
+
+ while (1) {
+ prepare_to_wait(&info->flush_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&info->lock);
+ if (!info->flushing) {
+ spin_unlock(&info->lock);
+ break;
+ }
+
+ used = info->bytes_used + info->bytes_reserved +
+ info->bytes_pinned + info->bytes_readonly +
+ info->bytes_super + info->bytes_root +
+ info->bytes_may_use + info->bytes_delalloc;
+ if (used < info->total_bytes) {
+ spin_unlock(&info->lock);
+ break;
+ }
+ spin_unlock(&info->lock);
+ schedule();
+ }
+ finish_wait(&info->flush_wait, &wait);
+}
+
+static void flush_delalloc(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct async_flush *async;
+ bool wait = false;
+
+ spin_lock(&info->lock);
+
+ if (!info->flushing) {
+ info->flushing = 1;
+ init_waitqueue_head(&info->flush_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_on_flush(info);
+ return;
+ }
+
+ async = kzalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ goto flush;
+
+ async->root = root;
+ async->info = info;
+ async->work.func = flush_delalloc_async;
+
+ btrfs_queue_worker(&root->fs_info->enospc_workers,
+ &async->work);
+ wait_on_flush(info);
+ return;
+
+flush:
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+}
+
+static int maybe_allocate_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ bool wait = false;
+ int ret = 0;
+ u64 min_metadata;
+ u64 free_space;
+
+ free_space = btrfs_super_total_bytes(disk_super);
+ /*
+ * we allow the metadata to grow to a max of either 5gb or 5% of the
+ * space in the volume.
+ */
+ min_metadata = min((u64)5 * 1024 * 1024 * 1024,
+ div64_u64(free_space * 5, 100));
+ if (info->total_bytes >= min_metadata) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (info->full) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (!info->allocating_chunk) {
+ info->force_alloc = 1;
+ info->allocating_chunk = 1;
+ init_waitqueue_head(&info->allocate_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_event(info->allocate_wait,
+ !info->allocating_chunk);
+ return 1;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 4096 + 2 * 1024 * 1024,
+ info->flags, 0);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+out:
+ spin_lock(&info->lock);
+ info->allocating_chunk = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->allocate_wait);
+
+ if (ret)
+ return 0;
+ return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int flushed = 0;
+ int force_delalloc;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ force_delalloc = meta_sinfo->force_delalloc;
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!flushed)
+ meta_sinfo->bytes_delalloc += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ flushed++;
+
+ if (flushed == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ flushed++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (flushed == 2) {
+ filemap_flush(inode->i_mapping);
+ goto again;
+ } else if (flushed == 3) {
+ flush_delalloc(root, meta_sinfo);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
+ printk(KERN_ERR "enospc, has %d, reserved %d\n",
+ BTRFS_I(inode)->outstanding_extents,
+ BTRFS_I(inode)->reserved_extents);
+ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
- if (!committed) {
- committed = 1;
- trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
+ BTRFS_I(inode)->reserved_extents++;
+ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ if (!flushed && force_delalloc)
+ filemap_flush(inode->i_mapping);
+
+ return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space. This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+
+ spin_lock(&meta_sinfo->lock);
+ if (meta_sinfo->bytes_may_use < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_may_use = 0;
+ } else {
+ meta_sinfo->bytes_may_use -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+/*
+ * Reserve some metadata space for use. We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items. If we
+ * have space, fantastic, if not, you get -ENOSPC. Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space. THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int retries = 0;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!retries)
+ meta_sinfo->bytes_may_use += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ retries++;
+ if (retries == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ retries++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (retries == 2) {
+ flush_delalloc(root, meta_sinfo);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_may_use -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
+
+ dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
+
+ check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
return 0;
@@ -2765,13 +3225,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
data_sinfo = BTRFS_I(inode)->space_info;
+ if (!data_sinfo)
+ goto alloc;
+
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
if (data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
- data_sinfo->bytes_may_use < bytes) {
+ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
struct btrfs_trans_handle *trans;
/*
@@ -2783,7 +3246,7 @@ again:
data_sinfo->force_alloc = 1;
spin_unlock(&data_sinfo->lock);
-
+alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
trans = btrfs_start_transaction(root, 1);
if (!trans)
@@ -2795,12 +3258,17 @@ again:
btrfs_end_transaction(trans, root);
if (ret)
return ret;
+
+ if (!data_sinfo) {
+ btrfs_set_inode_space_info(root, inode);
+ data_sinfo = BTRFS_I(inode)->space_info;
+ }
goto again;
}
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
- if (!committed) {
+ if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
if (!trans)
@@ -2828,7 +3296,7 @@ again:
BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
- return btrfs_check_metadata_free_space(root);
+ return 0;
}
/*
@@ -2927,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
BUG_ON(!space_info);
spin_lock(&space_info->lock);
- if (space_info->force_alloc) {
+ if (space_info->force_alloc)
force = 1;
- space_info->force_alloc = 0;
- }
if (space_info->full) {
spin_unlock(&space_info->lock);
goto out;
}
thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 6);
+ thresh = div_factor(thresh, 8);
if (!force &&
(space_info->bytes_used + space_info->bytes_pinned +
space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -2951,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
- if (flags & BTRFS_BLOCK_GROUP_DATA) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
@@ -2959,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ space_info->force_alloc = 0;
+ spin_unlock(&space_info->lock);
out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
@@ -3009,10 +3478,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
old_val += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
if (cache->ro)
cache->space_info->bytes_readonly -= num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
@@ -3057,127 +3528,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-int btrfs_update_pinned_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int pin)
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
{
- u64 len;
- struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache;
- if (pin)
- set_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + num - 1, GFP_NOFS);
-
- while (num > 0) {
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
- len = min(num, cache->key.offset -
- (bytenr - cache->key.objectid));
- if (pin) {
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += len;
- cache->space_info->bytes_pinned += len;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fs_info->total_pinned += len;
- } else {
- int unpin = 0;
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
+ BUG_ON(!cache);
- /*
- * in order to not race with the block group caching, we
- * only want to unpin the extent if we are cached. If
- * we aren't cached, we want to start async caching this
- * block group so we can free the extent the next time
- * around.
- */
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- unpin = (cache->cached == BTRFS_CACHE_FINISHED);
- if (likely(unpin)) {
- cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- fs_info->total_pinned -= len;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ if (reserved) {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
- if (likely(unpin))
- clear_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + len -1,
- GFP_NOFS);
- else
- cache_block_group(cache);
+ btrfs_put_block_group(cache);
- if (unpin)
- btrfs_add_free_space(cache, bytenr, len);
- }
- btrfs_put_block_group(cache);
- bytenr += len;
- num -= len;
+ set_extent_dirty(fs_info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ return 0;
+}
+
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ cache->reserved += num_bytes;
+ cache->space_info->bytes_reserved += num_bytes;
+ } else {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
}
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
return 0;
}
-static int update_reserved_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int reserve)
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
- u64 len;
- struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_caching_control *next;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_block_group_cache *cache;
- while (num > 0) {
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
- len = min(num, cache->key.offset -
- (bytenr - cache->key.objectid));
+ down_write(&fs_info->extent_commit_sem);
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- cache->reserved += len;
- cache->space_info->bytes_reserved += len;
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ cache = caching_ctl->block_group;
+ if (block_group_cache_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ put_caching_control(caching_ctl);
} else {
- cache->reserved -= len;
- cache->space_info->bytes_reserved -= len;
+ cache->last_byte_to_unpin = caching_ctl->progress;
}
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- btrfs_put_block_group(cache);
- bytenr += len;
- num -= len;
}
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ fs_info->pinned_extents = &fs_info->freed_extents[1];
+ else
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+ up_write(&fs_info->extent_commit_sem);
return 0;
}
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- u64 last = 0;
- u64 start;
- u64 end;
- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
- int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 len;
- while (1) {
- ret = find_first_extent_bit(pinned_extents, last,
- &start, &end, EXTENT_DIRTY);
- if (ret)
- break;
+ while (start <= end) {
+ if (!cache ||
+ start >= cache->key.objectid + cache->key.offset) {
+ if (cache)
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_block_group(fs_info, start);
+ BUG_ON(!cache);
+ }
- set_extent_dirty(copy, start, end, GFP_NOFS);
- last = end + 1;
+ len = cache->key.objectid + cache->key.offset - start;
+ len = min(len, end + 1 - start);
+
+ if (start < cache->last_byte_to_unpin) {
+ len = min(len, cache->last_byte_to_unpin - start);
+ btrfs_add_free_space(cache, start, len);
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ cache->space_info->bytes_pinned -= len;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ start += len;
}
+
+ if (cache)
+ btrfs_put_block_group(cache);
return 0;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_io_tree *unpin)
+ struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
+ else
+ unpin = &fs_info->freed_extents[0];
+
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -3186,10 +3666,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
- /* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
-
+ unpin_extent_range(root, start, end);
cond_resched();
}
@@ -3199,7 +3677,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- u64 bytenr, u64 num_bytes, int is_data,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
struct extent_buffer **must_clean)
{
int err = 0;
@@ -3208,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
if (is_data)
goto pinit;
+ /*
+ * discard is sloooow, and so triggering discards on
+ * individual btree blocks isn't a good plan. Just
+ * pin everything in discard mode.
+ */
+ if (btrfs_test_opt(root, DISCARD))
+ goto pinit;
+
buf = btrfs_find_tree_block(root, bytenr, num_bytes);
if (!buf)
goto pinit;
@@ -3231,15 +3718,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
}
free_extent_buffer(buf);
pinit:
- btrfs_set_path_blocking(path);
+ if (path)
+ btrfs_set_path_blocking(path);
/* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+ btrfs_pin_extent(root, bytenr, num_bytes, reserved);
BUG_ON(err < 0);
return 0;
}
-
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -3413,7 +3900,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
ret = pin_down_bytes(trans, root, path, bytenr,
- num_bytes, is_data, &must_clean);
+ num_bytes, is_data, 0, &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
@@ -3544,8 +4031,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
- update_reserved_extents(root, bytenr, num_bytes, 0);
+ btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3585,19 +4071,33 @@ static noinline int
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
+ struct btrfs_caching_control *caching_ctl;
DEFINE_WAIT(wait);
- prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
-
- if (block_group_cache_done(cache)) {
- finish_wait(&cache->caching_q, &wait);
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
return 0;
- }
- schedule();
- finish_wait(&cache->caching_q, &wait);
- wait_event(cache->caching_q, block_group_cache_done(cache) ||
+ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
(cache->free_space >= num_bytes));
+
+ put_caching_control(caching_ctl);
+ return 0;
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache));
+
+ put_caching_control(caching_ctl);
return 0;
}
@@ -3635,6 +4135,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int last_ptr_loop = 0;
int loop = 0;
bool found_uncached_bg = false;
+ bool failed_cluster_refill = false;
+ bool failed_alloc = false;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3732,7 +4234,16 @@ have_block_group:
if (unlikely(block_group->ro))
goto loop;
- if (last_ptr) {
+ /*
+ * Ok we want to try and use the cluster allocator, so lets look
+ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
+ * have tried the cluster allocator plenty of times at this
+ * point and not have found anything, so we are likely way too
+ * fragmented for the clustering stuff to find anything, so lets
+ * just skip it and let the allocator find whatever block it can
+ * find
+ */
+ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
@@ -3807,9 +4318,11 @@ refill_cluster:
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
- } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT
+ && !failed_cluster_refill) {
spin_unlock(&last_ptr->refill_lock);
+ failed_cluster_refill = true;
wait_block_group_cache_progress(block_group,
num_bytes + empty_cluster + empty_size);
goto have_block_group;
@@ -3821,25 +4334,30 @@ refill_cluster:
* cluster. Free the cluster we've been trying
* to use, and go to the next block group
*/
- if (loop < LOOP_NO_EMPTY_SIZE) {
- btrfs_return_cluster_to_free_space(NULL,
- last_ptr);
- spin_unlock(&last_ptr->refill_lock);
- goto loop;
- }
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
spin_unlock(&last_ptr->refill_lock);
+ goto loop;
}
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
- if (!offset && (cached || (!cached &&
- loop == LOOP_CACHING_NOWAIT))) {
- goto loop;
- } else if (!offset && (!cached &&
- loop > LOOP_CACHING_NOWAIT)) {
+ /*
+ * If we didn't find a chunk, and we haven't failed on this
+ * block group before, and this block group is in the middle of
+ * caching and we are ok with waiting, then go ahead and wait
+ * for progress to be made, and set failed_alloc to true.
+ *
+ * If failed_alloc is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !failed_alloc && !cached &&
+ loop > LOOP_CACHING_NOWAIT) {
wait_block_group_cache_progress(block_group,
- num_bytes + empty_size);
+ num_bytes + empty_size);
+ failed_alloc = true;
goto have_block_group;
+ } else if (!offset) {
+ goto loop;
}
checks:
search_start = stripe_align(root, offset);
@@ -3881,9 +4399,13 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
+ update_reserved_extents(block_group, num_bytes, 1);
+
/* we are all good, lets return */
break;
loop:
+ failed_cluster_refill = false;
+ failed_alloc = false;
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
@@ -3941,21 +4463,32 @@ loop:
return ret;
}
-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
- info->bytes_pinned - info->bytes_reserved),
+ info->bytes_pinned - info->bytes_reserved -
+ info->bytes_super),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu\n",
+ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+ "\n",
(unsigned long long)info->total_bytes,
(unsigned long long)info->bytes_pinned,
(unsigned long long)info->bytes_delalloc,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used);
+ (unsigned long long)info->bytes_used,
+ (unsigned long long)info->bytes_root,
+ (unsigned long long)info->bytes_super,
+ (unsigned long long)info->bytes_reserved);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
@@ -3973,12 +4506,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
up_read(&info->groups_sem);
}
-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data)
{
int ret;
u64 search_start = 0;
@@ -4023,7 +4556,7 @@ again:
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes);
+ dump_space_info(sinfo, num_bytes, 1);
}
return ret;
@@ -4044,25 +4577,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
+ update_reserved_extents(cache, len, 0);
btrfs_put_block_group(cache);
- update_reserved_extents(root, start, len, 0);
-
- return ret;
-}
-
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
-{
- int ret;
- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
- empty_size, hint_byte, search_end, ins,
- data);
- if (!ret)
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
return ret;
}
@@ -4223,15 +4739,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+ u64 start = ins->objectid;
+ u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
cache_block_group(block_group);
- wait_event(block_group->caching_q,
- block_group_cache_done(block_group));
+ caching_ctl = get_caching_control(block_group);
- ret = btrfs_remove_free_space(block_group, ins->objectid,
- ins->offset);
- BUG_ON(ret);
+ if (!caching_ctl) {
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+
+ start = caching_ctl->progress;
+ num_bytes = ins->objectid + ins->offset -
+ caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ }
+
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+
+ update_reserved_extents(block_group, ins->offset, 1);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -4255,9 +4802,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u64 flags = 0;
- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- empty_size, hint_byte, search_end,
- ins, 0);
+ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+ empty_size, hint_byte, search_end,
+ ins, 0);
if (ret)
return ret;
@@ -4268,7 +4815,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
} else
BUG_ON(parent > 0);
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4347,452 +4893,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return buf;
}
-#if 0
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *leaf)
-{
- u64 disk_bytenr;
- u64 num_bytes;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- u32 nritems;
- int i;
- int ret;
-
- BUG_ON(!btrfs_is_leaf(leaf));
- nritems = btrfs_header_nritems(leaf);
-
- for (i = 0; i < nritems; i++) {
- cond_resched();
- btrfs_item_key_to_cpu(leaf, &key, i);
-
- /* only extents have references, skip everything else */
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
-
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-
- /* inline extents live in the btree, they don't have refs */
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
-
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-
- /* holes don't have refs */
- if (disk_bytenr == 0)
- continue;
-
- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
- leaf->start, 0, key.objectid, 0);
- BUG_ON(ret);
- }
- return 0;
-}
-
-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_leaf_ref *ref)
-{
- int i;
- int ret;
- struct btrfs_extent_info *info;
- struct refsort *sorted;
-
- if (ref->nritems == 0)
- return 0;
-
- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
- for (i = 0; i < ref->nritems; i++) {
- sorted[i].bytenr = ref->extents[i].bytenr;
- sorted[i].slot = i;
- }
- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
-
- /*
- * the items in the ref were sorted when the ref was inserted
- * into the ref cache, so this is already in order
- */
- for (i = 0; i < ref->nritems; i++) {
- info = ref->extents + sorted[i].slot;
- ret = btrfs_free_extent(trans, root, info->bytenr,
- info->num_bytes, ref->bytenr,
- ref->owner, ref->generation,
- info->objectid, 0);
-
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
-
- BUG_ON(ret);
- info++;
- }
-
- kfree(sorted);
- return 0;
-}
-
-
-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 start,
- u64 len, u32 *refs)
-{
- int ret;
-
- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
- BUG_ON(ret);
-
-#if 0 /* some debugging code in case we see problems here */
- /* if the refs count is one, it won't get increased again. But
- * if the ref count is > 1, someone may be decreasing it at
- * the same time we are.
- */
- if (*refs != 1) {
- struct extent_buffer *eb = NULL;
- eb = btrfs_find_create_tree_block(root, start, len);
- if (eb)
- btrfs_tree_lock(eb);
-
- mutex_lock(&root->fs_info->alloc_mutex);
- ret = lookup_extent_ref(NULL, root, start, len, refs);
- BUG_ON(ret);
- mutex_unlock(&root->fs_info->alloc_mutex);
-
- if (eb) {
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- if (*refs == 1) {
- printk(KERN_ERR "btrfs block %llu went down to one "
- "during drop_snap\n", (unsigned long long)start);
- }
-
- }
-#endif
-
- cond_resched();
- return ret;
-}
+struct walk_control {
+ u64 refs[BTRFS_MAX_LEVEL];
+ u64 flags[BTRFS_MAX_LEVEL];
+ struct btrfs_key update_progress;
+ int stage;
+ int level;
+ int shared_level;
+ int update_ref;
+ int keep_locks;
+ int reada_slot;
+ int reada_count;
+};
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
-/*
- * this is used while deleting old snapshots, and it drops the refs
- * on a whole subtree starting from a level 1 node.
- *
- * The idea is to sort all the leaf pointers, and then drop the
- * ref on all the leaves in order. Most of the time the leaves
- * will have ref cache entries, so no leaf IOs will be required to
- * find the extents they have references on.
- *
- * For each leaf, any references it has are also dropped in order
- *
- * This ends up dropping the references in something close to optimal
- * order for reading and modifying the extent allocation tree.
- */
-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct walk_control *wc,
+ struct btrfs_path *path)
{
u64 bytenr;
- u64 root_owner;
- u64 root_gen;
- struct extent_buffer *eb = path->nodes[1];
- struct extent_buffer *leaf;
- struct btrfs_leaf_ref *ref;
- struct refsort *sorted = NULL;
- int nritems = btrfs_header_nritems(eb);
+ u64 generation;
+ u64 refs;
+ u64 flags;
+ u64 last = 0;
+ u32 nritems;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
int ret;
- int i;
- int refi = 0;
- int slot = path->slots[1];
- u32 blocksize = btrfs_level_size(root, 0);
- u32 refs;
-
- if (nritems == 0)
- goto out;
-
- root_owner = btrfs_header_owner(eb);
- root_gen = btrfs_header_generation(eb);
- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+ int slot;
+ int nread = 0;
- /*
- * step one, sort all the leaf pointers so we don't scribble
- * randomly into the extent allocation tree
- */
- for (i = slot; i < nritems; i++) {
- sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
- sorted[refi].slot = i;
- refi++;
+ if (path->slots[wc->level] < wc->reada_slot) {
+ wc->reada_count = wc->reada_count * 2 / 3;
+ wc->reada_count = max(wc->reada_count, 2);
+ } else {
+ wc->reada_count = wc->reada_count * 3 / 2;
+ wc->reada_count = min_t(int, wc->reada_count,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
}
- /*
- * nritems won't be zero, but if we're picking up drop_snapshot
- * after a crash, slot might be > 0, so double check things
- * just in case.
- */
- if (refi == 0)
- goto out;
+ eb = path->nodes[wc->level];
+ nritems = btrfs_header_nritems(eb);
+ blocksize = btrfs_level_size(root, wc->level - 1);
- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+ if (nread >= wc->reada_count)
+ break;
- /*
- * the first loop frees everything the leaves point to
- */
- for (i = 0; i < refi; i++) {
- u64 ptr_gen;
+ cond_resched();
+ bytenr = btrfs_node_blockptr(eb, slot);
+ generation = btrfs_node_ptr_generation(eb, slot);
- bytenr = sorted[i].bytenr;
+ if (slot == path->slots[wc->level])
+ goto reada;
- /*
- * check the reference count on this leaf. If it is > 1
- * we just decrement it below and don't update any
- * of the refs the leaf points to.
- */
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
- blocksize, &refs);
- BUG_ON(ret);
- if (refs != 1)
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
continue;
- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
-
- /*
- * the leaf only had one reference, which means the
- * only thing pointing to this leaf is the snapshot
- * we're deleting. It isn't possible for the reference
- * count to increase again later
- *
- * The reference cache is checked for the leaf,
- * and if found we'll be able to drop any refs held by
- * the leaf without needing to read it in.
- */
- ref = btrfs_lookup_leaf_ref(root, bytenr);
- if (ref && ref->generation != ptr_gen) {
- btrfs_free_leaf_ref(root, ref);
- ref = NULL;
- }
- if (ref) {
- ret = cache_drop_leaf_ref(trans, root, ref);
- BUG_ON(ret);
- btrfs_remove_leaf_ref(root, ref);
- btrfs_free_leaf_ref(root, ref);
- } else {
- /*
- * the leaf wasn't in the reference cache, so
- * we have to read it.
- */
- leaf = read_tree_block(root, bytenr, blocksize,
- ptr_gen);
- ret = btrfs_drop_leaf_ref(trans, root, leaf);
- BUG_ON(ret);
- free_extent_buffer(leaf);
- }
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
- }
-
- /*
- * run through the loop again to free the refs on the leaves.
- * This is faster than doing it in the loop above because
- * the leaves are likely to be clustered together. We end up
- * working in nice chunks on the extent allocation tree.
- */
- for (i = 0; i < refi; i++) {
- bytenr = sorted[i].bytenr;
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, eb->start,
- root_owner, root_gen, 0, 1);
+ /* We don't lock the tree block, it's OK to be racy here */
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &refs, &flags);
BUG_ON(ret);
+ BUG_ON(refs == 0);
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
- }
-out:
- kfree(sorted);
-
- /*
- * update the path to show we've processed the entire level 1
- * node. This will get saved into the root's drop_snapshot_progress
- * field so these drops are not repeated again if this transaction
- * commits.
- */
- path->slots[1] = nritems;
- return 0;
-}
-
-/*
- * helper function for drop_snapshot, this walks down the tree dropping ref
- * counts as it goes.
- */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level)
-{
- u64 root_owner;
- u64 root_gen;
- u64 bytenr;
- u64 ptr_gen;
- struct extent_buffer *next;
- struct extent_buffer *cur;
- struct extent_buffer *parent;
- u32 blocksize;
- int ret;
- u32 refs;
-
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
- path->nodes[*level]->len, &refs);
- BUG_ON(ret);
- if (refs > 1)
- goto out;
-
- /*
- * walk down to the last node level and free all the leaves
- */
- while (*level >= 0) {
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- cur = path->nodes[*level];
-
- if (btrfs_header_level(cur) != *level)
- WARN_ON(1);
-
- if (path->slots[*level] >=
- btrfs_header_nritems(cur))
- break;
+ if (wc->stage == DROP_REFERENCE) {
+ if (refs == 1)
+ goto reada;
- /* the new code goes down to level 1 and does all the
- * leaves pointed to that node in bulk. So, this check
- * for level 0 will always be false.
- *
- * But, the disk format allows the drop_snapshot_progress
- * field in the root to leave things in a state where
- * a leaf will need cleaning up here. If someone crashes
- * with the old code and then boots with the new code,
- * we might find a leaf here.
- */
- if (*level == 0) {
- ret = btrfs_drop_leaf_ref(trans, root, cur);
- BUG_ON(ret);
- break;
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ continue;
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ ret = btrfs_comp_cpu_keys(&key,
+ &wc->update_progress);
+ if (ret < 0)
+ continue;
+ } else {
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
}
-
- /*
- * once we get to level one, process the whole node
- * at once, including everything below it.
- */
- if (*level == 1) {
- ret = drop_level_one_refs(trans, root, path);
- BUG_ON(ret);
+reada:
+ ret = readahead_tree_block(root, bytenr, blocksize,
+ generation);
+ if (ret)
break;
- }
-
- bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
-
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
- blocksize, &refs);
- BUG_ON(ret);
-
- /*
- * if there is more than one reference, we don't need
- * to read that node to drop any references it has. We
- * just drop the ref we hold on that node and move on to the
- * next slot in this level.
- */
- if (refs != 1) {
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
- path->slots[*level]++;
-
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, parent->start,
- root_owner, root_gen,
- *level - 1, 1);
- BUG_ON(ret);
-
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
-
- continue;
- }
-
- /*
- * we need to keep freeing things in the next level down.
- * read the block and loop around to process it
- */
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
- WARN_ON(*level <= 0);
- if (path->nodes[*level-1])
- free_extent_buffer(path->nodes[*level-1]);
- path->nodes[*level-1] = next;
- *level = btrfs_header_level(next);
- path->slots[*level] = 0;
- cond_resched();
+ last = bytenr + blocksize;
+ nread++;
}
-out:
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
- if (path->nodes[*level] == root->node) {
- parent = path->nodes[*level];
- bytenr = path->nodes[*level]->start;
- } else {
- parent = path->nodes[*level + 1];
- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
- }
-
- blocksize = btrfs_level_size(root, *level);
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
-
- /*
- * cleanup and free the reference on the last node
- * we processed
- */
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
- parent->start, root_owner, root_gen,
- *level, 1);
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
-
- *level += 1;
- BUG_ON(ret);
-
- cond_resched();
- return 0;
+ wc->reada_slot = slot;
}
-#endif
-
-struct walk_control {
- u64 refs[BTRFS_MAX_LEVEL];
- u64 flags[BTRFS_MAX_LEVEL];
- struct btrfs_key update_progress;
- int stage;
- int level;
- int shared_level;
- int update_ref;
- int keep_locks;
-};
-
-#define DROP_REFERENCE 1
-#define UPDATE_BACKREF 2
/*
* hepler to process tree block while walking down the tree.
*
- * when wc->stage == DROP_REFERENCE, this function checks
- * reference count of the block. if the block is shared and
- * we need update back refs for the subtree rooted at the
- * block, this function changes wc->stage to UPDATE_BACKREF
- *
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
*
@@ -4801,11 +5003,10 @@ struct walk_control {
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc)
+ struct walk_control *wc, int lookup_info)
{
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
- struct btrfs_key key;
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
@@ -4817,8 +5018,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+ if (lookup_info &&
+ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
ret = btrfs_lookup_extent_info(trans, root,
eb->start, eb->len,
@@ -4828,21 +5030,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(wc->refs[level] == 0);
}
- if (wc->stage == DROP_REFERENCE &&
- wc->update_ref && wc->refs[level] > 1) {
- BUG_ON(eb == root->node);
- BUG_ON(path->slots[level] > 0);
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
- else
- btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
- if (btrfs_header_owner(eb) == root->root_key.objectid &&
- btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
- wc->stage = UPDATE_BACKREF;
- wc->shared_level = level;
- }
- }
-
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level] > 1)
return 1;
@@ -4879,6 +5066,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
+ * hepler to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int *lookup_info)
+{
+ u64 bytenr;
+ u64 generation;
+ u64 parent;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *next;
+ int level = wc->level;
+ int reada = 0;
+ int ret = 0;
+
+ generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+ /*
+ * if the lower level block was created before the snapshot
+ * was created, we know there is no need to update back refs
+ * for the subtree
+ */
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset) {
+ *lookup_info = 1;
+ return 1;
+ }
+
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ blocksize = btrfs_level_size(root, level - 1);
+
+ next = btrfs_find_tree_block(root, bytenr, blocksize);
+ if (!next) {
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ reada = 1;
+ }
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level - 1] == 0);
+ *lookup_info = 0;
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level - 1] > 1) {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ goto skip;
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+ if (ret < 0)
+ goto skip;
+
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
+ }
+ } else {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+ }
+
+ if (!btrfs_buffer_uptodate(next, generation)) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ next = NULL;
+ *lookup_info = 1;
+ }
+
+ if (!next) {
+ if (reada && level == 1)
+ reada_walk_down(trans, root, wc, path);
+ next = read_tree_block(root, bytenr, blocksize, generation);
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ }
+
+ level--;
+ BUG_ON(level != btrfs_header_level(next));
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ path->locks[level] = 1;
+ wc->level = level;
+ if (wc->level == 1)
+ wc->reada_slot = 0;
+ return 0;
+skip:
+ wc->refs[level - 1] = 0;
+ wc->flags[level - 1] = 0;
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level]));
+ parent = 0;
+ }
+
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+ root->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
+ }
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ *lookup_info = 1;
+ return 1;
+}
+
+/*
* hepler to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
@@ -4905,7 +5222,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (level < wc->shared_level)
goto out;
- BUG_ON(wc->refs[level] <= 1);
ret = find_next_key(path, level + 1, &wc->update_progress);
if (ret > 0)
wc->update_ref = 0;
@@ -4936,8 +5252,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
return 1;
}
- } else {
- BUG_ON(level != 0);
}
}
@@ -4990,39 +5304,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
- struct extent_buffer *next;
- struct extent_buffer *cur;
- u64 bytenr;
- u64 ptr_gen;
- u32 blocksize;
int level = wc->level;
+ int lookup_info = 1;
int ret;
while (level >= 0) {
- cur = path->nodes[level];
- BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
+ break;
- ret = walk_down_proc(trans, root, path, wc);
+ ret = walk_down_proc(trans, root, path, wc, lookup_info);
if (ret > 0)
break;
if (level == 0)
break;
- bytenr = btrfs_node_blockptr(cur, path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
-
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
-
- level--;
- BUG_ON(level != btrfs_header_level(next));
- path->nodes[level] = next;
- path->slots[level] = 0;
- path->locks[level] = 1;
- wc->level = level;
+ ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ if (ret > 0) {
+ path->slots[level]++;
+ continue;
+ }
+ level = wc->level;
}
return 0;
}
@@ -5112,9 +5415,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
err = ret;
goto out;
}
- btrfs_node_key_to_cpu(path->nodes[level], &key,
- path->slots[level]);
- WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
+ WARN_ON(ret > 0);
/*
* unlock our path, this is safe because only this
@@ -5149,6 +5450,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
ret = walk_down_tree(trans, root, path, wc);
@@ -5201,9 +5503,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
ret = btrfs_del_root(trans, tree_root, &root->root_key);
BUG_ON(ret);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root);
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
+ NULL, NULL);
+ BUG_ON(ret < 0);
+ if (ret > 0) {
+ ret = btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ }
+ }
+
+ if (root->in_radix) {
+ btrfs_free_fs_root(tree_root->fs_info, root);
+ } else {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root);
+ }
out:
btrfs_end_transaction(trans, tree_root);
kfree(wc);
@@ -5255,6 +5572,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
wret = walk_down_tree(trans, root, path, wc);
@@ -5397,9 +5715,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
while (1) {
int ret;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -6842,287 +7160,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
return 0;
}
-#if 0
-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 size)
-{
- struct btrfs_path *path;
- struct btrfs_inode_item *item;
- struct extent_buffer *leaf;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
- btrfs_set_inode_generation(leaf, item, 1);
- btrfs_set_inode_size(leaf, item, size);
- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
- struct btrfs_key root_key;
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
- int err = 0;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_device *device;
+ int full = 0;
+ int ret = 0;
- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
+ /* odd, couldn't find the block group, leave it alone */
+ if (!block_group)
+ return -1;
- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
- if (err)
+ /* no bytes used, we're good */
+ if (!btrfs_block_group_used(&block_group->item))
goto out;
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0, group->key.offset,
- 0, 0, 0);
- BUG_ON(err);
-
- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
- if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- BTRFS_I(inode)->location.objectid = objectid;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
- btrfs_read_locked_inode(inode);
- unlock_new_inode(inode);
- BUG_ON(is_bad_inode(inode));
- } else {
- BUG_ON(1);
- }
- BTRFS_I(inode)->index_cnt = group->key.objectid;
-
- err = btrfs_orphan_add(trans, inode);
-out:
- btrfs_end_transaction(trans, root);
- if (err) {
- if (inode)
- iput(inode);
- inode = ERR_PTR(err);
- }
- return inode;
-}
-
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
-{
-
- struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct list_head list;
- size_t offset;
- int ret;
- u64 disk_bytenr;
-
- INIT_LIST_HEAD(&list);
-
- ordered = btrfs_lookup_ordered_extent(inode, file_pos);
- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
-
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list);
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del_init(&sums->list);
-
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ full = space_info->full;
- btrfs_add_ordered_sum(inode, ordered, sums);
+ /*
+ * if this is the last block group we have in this space, we can't
+ * relocate it unless we're able to allocate a new chunk below.
+ *
+ * Otherwise, we need to make sure we have room in the space to handle
+ * all of the extents from this block group. If we can, we're good
+ */
+ if ((space_info->total_bytes != block_group->key.offset) &&
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ btrfs_block_group_used(&block_group->item) <
+ space_info->total_bytes)) {
+ spin_unlock(&space_info->lock);
+ goto out;
}
- btrfs_put_ordered_extent(ordered);
- return 0;
-}
-
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
- struct btrfs_fs_info *info = root->fs_info;
- struct extent_buffer *leaf;
- struct inode *reloc_inode;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_key key;
- u64 skipped;
- u64 cur_byte;
- u64 total_found;
- u32 nritems;
- int ret;
- int progress;
- int pass = 0;
-
- root = root->fs_info->extent_root;
-
- block_group = btrfs_lookup_block_group(info, group_start);
- BUG_ON(!block_group);
-
- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
- (unsigned long long)block_group->key.objectid,
- (unsigned long long)block_group->flags);
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- reloc_inode = create_reloc_inode(info, block_group);
- BUG_ON(IS_ERR(reloc_inode));
-
- __alloc_chunk_for_shrink(root, block_group, 1);
- set_block_group_readonly(block_group);
-
- btrfs_start_delalloc_inodes(info->tree_root);
- btrfs_wait_ordered_extents(info->tree_root, 0);
-again:
- skipped = 0;
- total_found = 0;
- progress = 0;
- key.objectid = block_group->key.objectid;
- key.offset = 0;
- key.type = 0;
- cur_byte = key.objectid;
-
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
+ spin_unlock(&space_info->lock);
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(info->tree_root);
- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+ /*
+ * ok we don't have enough space, but maybe we have free space on our
+ * devices to allocate new chunks for relocation, so loop through our
+ * alloc devices and guess if we have enough space. However, if we
+ * were marked as full, then we know there aren't enough chunks, and we
+ * can just return.
+ */
+ ret = -1;
+ if (full)
+ goto out;
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
+ mutex_lock(&root->fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ u64 min_free = btrfs_block_group_used(&block_group->item);
+ u64 dev_offset, max_avail;
- while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-next:
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
+ /*
+ * check to make sure we can actually find a chunk with enough
+ * space to fit our block group in.
+ */
+ if (device->total_bytes > device->bytes_used + min_free) {
+ ret = find_free_dev_extent(NULL, device, min_free,
+ &dev_offset, &max_avail);
+ if (!ret)
break;
- }
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (key.objectid >= block_group->key.objectid +
- block_group->key.offset)
- break;
-
- if (progress && need_resched()) {
- btrfs_release_path(root, path);
- cond_resched();
- progress = 0;
- continue;
- }
- progress = 1;
-
- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
- key.objectid + key.offset <= cur_byte) {
- path->slots[0]++;
- goto next;
+ ret = -1;
}
-
- total_found++;
- cur_byte = key.objectid + key.offset;
- btrfs_release_path(root, path);
-
- __alloc_chunk_for_shrink(root, block_group, 0);
- ret = relocate_one_extent(root, path, &key, block_group,
- reloc_inode, pass);
- BUG_ON(ret < 0);
- if (ret > 0)
- skipped++;
-
- key.objectid = cur_byte;
- key.type = 0;
- key.offset = 0;
}
-
- btrfs_release_path(root, path);
-
- if (pass == 0) {
- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
- }
-
- if (total_found > 0) {
- printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
- (unsigned long long)total_found, pass);
- pass++;
- if (total_found == skipped && pass > 2) {
- iput(reloc_inode);
- reloc_inode = create_reloc_inode(info, block_group);
- pass = 0;
- }
- goto again;
- }
-
- /* delete reloc_inode */
- iput(reloc_inode);
-
- /* unpin extents in this range */
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
-
- spin_lock(&block_group->lock);
- WARN_ON(block_group->pinned > 0);
- WARN_ON(block_group->reserved > 0);
- WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
- spin_unlock(&block_group->lock);
- btrfs_put_block_group(block_group);
- ret = 0;
+ mutex_unlock(&root->fs_info->chunk_mutex);
out:
- btrfs_free_path(path);
+ btrfs_put_block_group(block_group);
return ret;
}
-#endif
static int find_first_block_group(struct btrfs_root *root,
struct btrfs_path *path, struct btrfs_key *key)
@@ -7165,8 +7282,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
+ down_write(&info->extent_commit_sem);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ }
+ up_write(&info->extent_commit_sem);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7180,8 +7307,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
- wait_event(block_group->caching_q,
- block_group_cache_done(block_group));
+ wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
@@ -7251,7 +7377,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
cache->fs_info = info;
- init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -7273,8 +7398,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
- remove_sb_from_cache(root, cache);
-
/*
* check for two cases, either we are full, and therefore
* don't need to bother with the caching work since we won't
@@ -7283,13 +7406,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* time, particularly in the full case.
*/
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ free_excluded_extents(root, cache);
} else if (btrfs_block_group_used(&cache->item) == 0) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, root->fs_info,
found_key.objectid,
found_key.objectid +
found_key.offset);
+ free_excluded_extents(root, cache);
}
ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7297,6 +7426,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
&space_info);
BUG_ON(ret);
cache->space_info = space_info;
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
down_write(&space_info->groups_sem);
list_add_tail(&cache->list, &space_info->block_groups);
up_write(&space_info->groups_sem);
@@ -7346,7 +7479,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
- init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -7355,15 +7487,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
+ cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- remove_sb_from_cache(root, cache);
+ exclude_super_stripes(root, cache);
add_new_free_space(cache, root->fs_info, chunk_offset,
chunk_offset + size);
+ free_excluded_extents(root, cache);
+
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
+
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
down_write(&cache->space_info->groups_sem);
list_add_tail(&cache->list, &cache->space_info->block_groups);
up_write(&cache->space_info->groups_sem);
@@ -7429,8 +7569,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
- wait_event(block_group->caching_q,
- block_group_cache_done(block_group));
+ wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f587..96577e8bf9fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
return NULL;
}
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+ struct extent_state *other)
+{
+ if (tree->ops && tree->ops->merge_extent_hook)
+ tree->ops->merge_extent_hook(tree->mapping->host, new,
+ other);
+}
+
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
state->start = other->start;
other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
other->start = state->start;
state->tree = NULL;
rb_erase(&state->rb_node, &tree->state);
free_extent_state(state);
+ state = NULL;
}
}
+
return 0;
}
-static void set_state_cb(struct extent_io_tree *tree,
+static int set_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
- tree->ops->set_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
+ return tree->ops->set_bit_hook(tree->mapping->host,
+ state->start, state->end,
+ state->state, bits);
}
+
+ return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
- if (tree->ops && tree->ops->clear_bit_hook) {
- tree->ops->clear_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
- }
+ if (tree->ops && tree->ops->clear_bit_hook)
+ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
int bits)
{
struct rb_node *node;
+ int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
(unsigned long long)start);
WARN_ON(1);
}
+ state->start = start;
+ state->end = end;
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if (bits & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
- set_state_cb(tree, state, bits);
state->state |= bits;
- state->start = start;
- state->end = end;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+ u64 split)
+{
+ if (tree->ops && tree->ops->split_extent_hook)
+ return tree->ops->split_extent_hook(tree->mapping->host,
+ orig, split);
+ return 0;
+}
+
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
struct rb_node *node;
+
+ split_cb(tree, orig, split);
+
prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state, int bits, int wake,
int delete)
{
- int ret = state->state & bits;
+ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int ret = state->state & bits_to_clear;
if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
- state->state &= ~bits;
+ state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
if (delete || state->state == 0) {
@@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
* bits were already set, or zero if none of the bits were already set.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, gfp_t mask)
+ int bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask)
{
struct extent_state *state;
+ struct extent_state *cached;
struct extent_state *prealloc = NULL;
+ struct rb_node *next_node;
struct rb_node *node;
u64 last_end;
int err;
@@ -488,6 +522,17 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+ *cached_state = NULL;
+ cached_state = NULL;
+ if (cached && cached->tree && cached->start == start) {
+ atomic_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ free_extent_state(cached);
+ }
/*
* this search will find the extents that end after
* our range starts
@@ -496,6 +541,7 @@ again:
if (!node)
goto out;
state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
if (state->start > end)
goto out;
WARN_ON(state->end < start);
@@ -526,13 +572,11 @@ again:
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits,
- wake, delete);
+ set |= clear_state_bit(tree, state, bits, wake,
+ delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- } else {
- start = state->start;
}
goto search_again;
}
@@ -547,19 +591,30 @@ again:
prealloc = alloc_extent_state(GFP_ATOMIC);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
-
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits,
- wake, delete);
+
+ set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
prealloc = NULL;
goto out;
}
+ if (state->end < end && prealloc && !need_resched())
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+
set |= clear_state_bit(tree, state, bits, wake, delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ if (start <= end && next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
goto search_again;
out:
@@ -641,40 +696,59 @@ out:
return 0;
}
-static void set_state_bits(struct extent_io_tree *tree,
+static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int bits)
{
+ int ret;
+
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- set_state_cb(tree, state, bits);
state->state |= bits;
+
+ return 0;
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ *cached_ptr = state;
+ atomic_inc(&state->refs);
+ }
+ }
}
/*
- * set some bits on a range in the tree. This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
+ * set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
*
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set. The start of the existing
- * range is returned in failed_start in this case.
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
*
- * [start, end] is inclusive
- * This takes the tree lock.
+ * [start, end] is inclusive This takes the tree lock.
*/
+
static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive, u64 *failed_start,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state,
gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
int err = 0;
- int set;
u64 last_start;
u64 last_end;
+
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -683,6 +757,13 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start == start && state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
/*
* this search will find all the extents that end after
* our range starts.
@@ -694,8 +775,8 @@ again:
BUG_ON(err == -EEXIST);
goto out;
}
-
state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
last_start = state->start;
last_end = state->end;
@@ -706,17 +787,32 @@ again:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set = state->state & bits;
- if (set && exclusive) {
+ struct rb_node *next_node;
+ if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
goto out;
}
- set_state_bits(tree, state, bits);
+
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+
+ cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
+
start = last_end + 1;
+ if (start < end && prealloc && !need_resched()) {
+ next_node = rb_next(node);
+ if (next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ }
goto search_again;
}
@@ -737,8 +833,7 @@ again:
* desired bit on it.
*/
if (state->start < start) {
- set = state->state & bits;
- if (exclusive && set) {
+ if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
@@ -749,13 +844,14 @@ again:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+ cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- } else {
- start = state->start;
}
goto search_again;
}
@@ -774,10 +870,13 @@ again:
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
bits);
- prealloc = NULL;
BUG_ON(err == -EEXIST);
- if (err)
+ if (err) {
+ prealloc = NULL;
goto out;
+ }
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -788,8 +887,7 @@ again:
* on the first half
*/
if (state->start <= end && state->end > end) {
- set = state->state & bits;
- if (exclusive && set) {
+ if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
@@ -797,7 +895,12 @@ again:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, bits);
+ if (err) {
+ prealloc = NULL;
+ goto out;
+ }
+ cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
@@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
- mask);
-}
-
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+ NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, 0, NULL,
- mask);
+ NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY,
- 0, NULL, mask);
+ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ 0, NULL, NULL, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0,
+ NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
- mask);
+ NULL, mask);
}
static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+ NULL, mask);
}
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- mask);
+ NULL, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-
-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
- 0, NULL, mask);
-}
-
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ NULL, mask);
}
int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached_state, gfp_t mask)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
- &failed_start, mask);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, mask);
if (err == -EEXIST && (mask & __GFP_WAIT)) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
return err;
}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+ return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
- &failed_start, mask);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, mask);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, mask);
+ EXTENT_LOCKED, 1, 0, NULL, mask);
return 0;
}
return 1;
}
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ mask);
}
/*
@@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- set_extent_dirty(tree, start, end, GFP_NOFS);
return 0;
}
@@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- set_extent_writeback(tree, start, end, GFP_NOFS);
return 0;
}
@@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 delalloc_start;
u64 delalloc_end;
u64 found;
+ struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1269,6 +1365,7 @@ again:
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
+ free_extent_state(cached_state);
if (!loops) {
unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1379,21 @@ again:
BUG_ON(ret);
/* step three, lock the state bits for the whole range */
- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end,
+ 0, &cached_state, GFP_NOFS);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1);
+ EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ unlock_extent_cached(tree, delalloc_start, delalloc_end,
+ &cached_state, GFP_NOFS);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
+ free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -1303,11 +1403,7 @@ out_failed:
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
- int unlock_pages,
- int clear_unlock,
- int clear_delalloc, int clear_dirty,
- int set_writeback,
- int end_writeback)
+ unsigned long op)
{
int ret;
struct page *pages[16];
@@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int i;
int clear_bits = 0;
- if (clear_unlock)
+ if (op & EXTENT_CLEAR_UNLOCK)
clear_bits |= EXTENT_LOCKED;
- if (clear_dirty)
+ if (op & EXTENT_CLEAR_DIRTY)
clear_bits |= EXTENT_DIRTY;
- if (clear_delalloc)
+ if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+ if (op & EXTENT_CLEAR_ACCOUNTING)
+ clear_bits |= EXTENT_DO_ACCOUNTING;
+
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+ EXTENT_SET_PRIVATE2)))
return 0;
while (nr_pages > 0) {
@@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
for (i = 0; i < ret; i++) {
+
+ if (op & EXTENT_SET_PRIVATE2)
+ SetPagePrivate2(pages[i]);
+
if (pages[i] == locked_page) {
page_cache_release(pages[i]);
continue;
}
- if (clear_dirty)
+ if (op & EXTENT_CLEAR_DIRTY)
clear_page_dirty_for_io(pages[i]);
- if (set_writeback)
+ if (op & EXTENT_SET_WRITEBACK)
set_page_writeback(pages[i]);
- if (end_writeback)
+ if (op & EXTENT_END_WRITEBACK)
end_page_writeback(pages[i]);
- if (unlock_pages)
+ if (op & EXTENT_CLEAR_UNLOCK_PAGE)
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
@@ -1476,14 +1581,17 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled)
+ int bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
int bitset = 0;
spin_lock(&tree->lock);
- node = tree_search(tree, start);
+ if (cached && cached->tree && cached->start == start)
+ node = &cached->rb_node;
+ else
+ node = tree_search(tree, start);
while (node && start <= end) {
state = rb_entry(node, struct extent_state, rb_node);
@@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
bitset = 0;
break;
}
+
+ if (state->end == (u64)-1)
+ break;
+
start = state->end + 1;
if (start > end)
break;
@@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
return 0;
}
@@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
return 0;
}
@@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree,
static int check_page_writeback(struct extent_io_tree *tree,
struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
- end_page_writeback(page);
+ end_page_writeback(page);
return 0;
}
@@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
}
if (!uptodate) {
- clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+ clear_extent_uptodate(tree, start, end, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
- clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
if (whole_page)
end_page_writeback(page);
else
@@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+ if (test_range_bit(tree, cur, cur_end,
+ EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur = cur + iosize;
@@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 iosize;
u64 unlock_start;
sector_t sector;
+ struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
int ret;
@@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end = 0;
page_started = 0;
if (!epd->extent_locked) {
+ u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
* to this page.
@@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
tree->ops->fill_delalloc(inode, page, delalloc_start,
delalloc_end, &page_started,
&nr_written);
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
delalloc_start = delalloc_end + 1;
}
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
/* did the fill delalloc function already unlock and start
* the IO?
@@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done_unlocked;
}
}
- lock_extent(tree, start, page_end, GFP_NOFS);
-
- unlock_start = start;
-
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
if (ret == -EAGAIN) {
- unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page);
@@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
-
if (last_byte <= start) {
- clear_extent_dirty(tree, start, page_end, GFP_NOFS);
- unlock_extent(tree, start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- set_extent_uptodate(tree, start, page_end, GFP_NOFS);
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
if (cur >= last_byte) {
- clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
- clear_extent_dirty(tree, cur,
- cur + iosize - 1, GFP_NOFS);
-
- unlock_extent(tree, unlock_start, cur + iosize - 1,
- GFP_NOFS);
-
/*
* end_io notification does not happen here for
* compressed extents
@@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
/* leave this out until we have a page_mkwrite call */
if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0)) {
+ EXTENT_DIRTY, 0, NULL)) {
cur = cur + iosize;
pg_offset += iosize;
continue;
}
- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
cur + iosize - 1);
@@ -2309,12 +2415,12 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (unlock_start <= page_end)
- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
unlock_page(page);
done_unlocked:
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
return 0;
}
@@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
+ int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
@@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
scanned = 1;
}
retry:
- while (!done && (index <= end) &&
+ while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY, min(end - index,
(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2518,15 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret || wbc->nr_to_write <= 0)
- done = 1;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
+ if (ret)
done = 1;
- }
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
}
pagevec_release(&pvec);
cond_resched();
@@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent(tree, start, end, GFP_NOFS);
- wait_on_extent_writeback(tree, start, end);
+ wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
- 1, 1, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
+ 1, 1, NULL, GFP_NOFS);
return 0;
}
@@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
!isnew && !PageUptodate(page) &&
(block_off_end > to || block_off_start < from) &&
!test_range_bit(tree, block_start, cur_end,
- EXTENT_UPTODATE, 1)) {
+ EXTENT_UPTODATE, 1, NULL)) {
u64 sector;
u64 extent_offset = block_start - em->start;
size_t iosize;
@@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
*/
set_extent_bit(tree, block_start,
block_start + iosize - 1,
- EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
NULL, 1,
@@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map,
int ret = 1;
if (test_range_bit(tree, start, end,
- EXTENT_IOBITS | EXTENT_ORDERED, 0))
+ EXTENT_IOBITS, 0, NULL))
ret = 0;
else {
if ((mask & GFP_NOFS) == GFP_NOFS)
mask = GFP_NOFS;
- clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- 1, 1, mask);
+ /*
+ * at this point we can safely clear everything except the
+ * locked bit and the nodatasum bit
+ */
+ clear_extent_bit(tree, start, end,
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+ 0, 0, NULL, mask);
}
return ret;
}
@@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 len;
while (start <= end) {
len = end - start + 1;
- spin_lock(&map->lock);
+ write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
if (!em || IS_ERR(em)) {
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
break;
}
if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
em->start != start) {
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
free_extent_map(em);
break;
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
- EXTENT_LOCKED | EXTENT_WRITEBACK |
- EXTENT_ORDERED,
- 0)) {
+ EXTENT_LOCKED | EXTENT_WRITEBACK,
+ 0, NULL)) {
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
start = extent_map_end(em);
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
@@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
if (ret)
return 1;
while (start <= end) {
@@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
return 1;
ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1);
+ EXTENT_UPTODATE, 1, NULL);
if (ret)
return ret;
@@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return 0;
if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1)) {
+ EXTENT_UPTODATE, 1, NULL)) {
return 0;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3d..36de250a7b2b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,9 @@
#define EXTENT_DEFRAG (1 << 6)
#define EXTENT_DEFRAG_DONE (1 << 7)
#define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
@@ -27,6 +26,16 @@
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
+/* these are flags for extent_clear_unlock_delalloc */
+#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
+#define EXTENT_CLEAR_UNLOCK 0x2
+#define EXTENT_CLEAR_DELALLOC 0x4
+#define EXTENT_CLEAR_DIRTY 0x8
+#define EXTENT_SET_WRITEBACK 0x10
+#define EXTENT_END_WRITEBACK 0x20
+#define EXTENT_SET_PRIVATE2 0x40
+#define EXTENT_CLEAR_ACCOUNTING 0x80
+
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
@@ -62,8 +71,13 @@ struct extent_io_ops {
struct extent_state *state, int uptodate);
int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits);
- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long bits);
+ int (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ int (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
int (*write_cache_pages_lock_hook)(struct page *page);
};
@@ -81,10 +95,14 @@ struct extent_state {
u64 start;
u64 end; /* inclusive */
struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
+ u64 split_start;
+ u64 split_end;
/* for use by the FS */
u64 private;
@@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached, gfp_t mask);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
@@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 max_bytes, unsigned long bits);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled);
+ int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, gfp_t mask);
+ int bits, int wake, int delete, struct extent_state **cached,
+ gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
- int unlock_page,
- int clear_unlock,
- int clear_delalloc, int clear_dirty,
- int set_writeback,
- int end_writeback);
+ unsigned long op);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e6..2c726b7b9faa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
{
tree->map.rb_node = NULL;
- spin_lock_init(&tree->lock);
+ rwlock_init(&tree->lock);
}
/**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
return 0;
}
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+ struct extent_map *em;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(em->start != start || !em);
+
+ if (!em)
+ goto out;
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ merge->in_tree = 0;
+ rb_erase(&merge->rb_node, &tree->map);
+ free_extent_map(merge);
+ }
+ }
+
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->len;
+ rb_erase(&merge->rb_node, &tree->map);
+ merge->in_tree = 0;
+ free_extent_map(merge);
+ }
+
+ free_extent_map(em);
+out:
+ write_unlock(&tree->lock);
+ return ret;
+
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
ret = -EEXIST;
goto out;
}
- assert_spin_locked(&tree->lock);
rb = tree_insert(&tree->map, em->start, &em->rb_node);
if (rb) {
ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- assert_spin_locked(&tree->lock);
rb_node = __tree_search(&tree->map, start, &prev, &next);
if (!rb_node && prev) {
em = rb_entry(prev, struct extent_map, rb_node);
@@ -319,6 +367,54 @@ out:
}
/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node && next) {
+ em = rb_entry(next, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node) {
+ em = NULL;
+ goto out;
+ }
+ if (IS_ERR(rb_node)) {
+ em = ERR_PTR(PTR_ERR(rb_node));
+ goto out;
+ }
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+ goto found;
+
+ em = NULL;
+ goto out;
+
+found:
+ atomic_inc(&em->refs);
+out:
+ return em;
+}
+
+/**
* remove_extent_mapping - removes an extent_map from the extent tree
* @tree: extent tree to remove from
* @em: extent map beeing removed
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
int ret = 0;
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- assert_spin_locked(&tree->lock);
rb_erase(&em->rb_node, &tree->map);
em->in_tree = 0;
return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb0..ab6d74b6e647 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
struct extent_map_tree {
struct rb_root map;
- spinlock_t lock;
+ rwlock_t lock;
};
static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b833972273a..06550affbd27 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
int err = 0;
int i;
struct inode *inode = fdentry(file)->d_inode;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- u64 hint_byte;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ if (err)
+ return err;
- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
- trans = btrfs_join_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- goto out_unlock;
- }
- btrfs_set_trans_block_group(trans, inode);
- hint_byte = 0;
-
- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-
- /* check for reserved extents on each page, we don't want
- * to reset the delalloc bit on things that already have
- * extents reserved.
- */
- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
SetPageUptodate(p);
@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* at this time.
*/
}
- err = btrfs_end_transaction(trans, root);
-out_unlock:
- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
return err;
}
@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
if (!split2)
split2 = alloc_extent_map(GFP_NOFS);
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
break;
}
flags = em->flags;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- spin_unlock(&em_tree->lock);
if (em->start <= start &&
(!testend || em->start + em->len >= start + len)) {
free_extent_map(em);
+ write_unlock(&em_tree->lock);
break;
}
if (start < em->start) {
@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
start = em->start + em->len;
}
free_extent_map(em);
+ write_unlock(&em_tree->lock);
continue;
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
split = NULL;
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
/* once for us */
free_extent_map(em);
@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_byte)
+ u64 inline_limit, u64 *hint_byte, int drop_cache)
{
u64 extent_end = 0;
u64 search_start = start;
@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int ret;
inline_limit = 0;
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
+ if (drop_cache)
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)
@@ -894,7 +878,8 @@ again:
btrfs_put_ordered_extent(ordered);
clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
GFP_NOFS);
unlock_extent(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, GFP_NOFS);
@@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
start_pos = pos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* do the reserve before the mutex lock in case we have to do some
+ * flushing. We wouldn't deadlock, but this is more polite.
+ */
+ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (err)
+ goto out_nolock;
+
+ mutex_lock(&inode->i_mutex);
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
- goto out_nolock;
+ goto out;
+
if (count == 0)
- goto out_nolock;
+ goto out;
err = file_remove_suid(file);
if (err)
- goto out_nolock;
+ goto out;
+
file_update_time(file);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
- mutex_lock(&inode->i_mutex);
+ /* generic_write_checks can change our pos */
+ start_pos = pos;
+
BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1024,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
if (will_write) {
- btrfs_fdatawrite_range(inode->i_mapping, pos,
- pos + write_bytes - 1,
- WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, pos,
+ pos + write_bytes - 1);
} else {
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
num_pages);
@@ -1047,6 +1045,7 @@ out:
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
out_nolock:
kfree(pages);
@@ -1087,8 +1086,10 @@ out_nolock:
btrfs_end_transaction(trans, root);
else
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (ret != BTRFS_NO_LOG_SYNC) {
btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
}
}
if (file->f_flags & O_DIRECT) {
@@ -1138,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
int ret = 0;
struct btrfs_trans_handle *trans;
+
+ /* we wait first, since the writeback may change the inode */
+ root->log_batch++;
+ /* the VFS called filemap_fdatawrite for us */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->log_batch++;
+
/*
* check the transaction that last modified this inode
* and see if its already been committed
@@ -1145,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (!BTRFS_I(inode)->last_trans)
goto out;
+ /*
+ * if the last transaction that changed this file was before
+ * the current transaction, we can bail out now without any
+ * syncing
+ */
mutex_lock(&root->fs_info->trans_mutex);
if (BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
@@ -1154,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
}
mutex_unlock(&root->fs_info->trans_mutex);
- root->log_batch++;
- filemap_fdatawrite(inode->i_mapping);
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- root->log_batch++;
-
- if (datasync && !(inode->i_state & I_DIRTY_PAGES))
- goto out;
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
@@ -1189,21 +1195,25 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
*/
mutex_unlock(&dentry->d_inode->i_mutex);
- if (ret > 0) {
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- ret = btrfs_end_transaction(trans, root);
- else
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ } else {
+ ret = btrfs_end_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
return ret > 0 ? EIO : ret;
}
-static struct vm_operations_struct btrfs_file_vm_ops = {
+static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = btrfs_page_mkwrite,
};
@@ -1215,7 +1225,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
-struct file_operations btrfs_file_operations = {
+const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5edcee3a617f..5c2caad76212 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
{
- u64 max_bytes, possible_bytes;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
/*
* The goal is to keep the total amount of memory used per 1gb of space
@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
max_bytes = MAX_CACHE_BYTES_PER_GIG *
(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
- possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
- (sizeof(struct btrfs_free_space) *
- block_group->extents_thresh);
+ /*
+ * we want to account for 1 more bitmap than what we have so we can make
+ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+ * we add more bitmaps.
+ */
+ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
- if (possible_bytes > max_bytes) {
- int extent_bytes = max_bytes -
- (block_group->total_bitmaps * PAGE_CACHE_SIZE);
+ if (bitmap_bytes >= max_bytes) {
+ block_group->extents_thresh = 0;
+ return;
+ }
- if (extent_bytes <= 0) {
- block_group->extents_thresh = 0;
- return;
- }
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the maxw
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
- block_group->extents_thresh = extent_bytes /
- (sizeof(struct btrfs_free_space));
- }
+ block_group->extents_thresh =
+ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
BUG_ON(block_group->total_bitmaps >= max_bitmaps);
info->offset = offset_to_bitmap(block_group, offset);
+ info->bytes = 0;
link_free_space(block_group, info);
block_group->total_bitmaps++;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6b627c611808..72ce3c173d6a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
+ if (ret == -EOVERFLOW)
+ ret = -EMLINK;
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_inode_item));
- if (ret == 0 && objectid > root->highest_inode)
- root->highest_inode = objectid;
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9abbced1123d..c56eb5909172 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = found_key.objectid;
+ *objectid = max_t(u64, found_key.objectid,
+ BTRFS_FIRST_FREE_OBJECTID - 1);
} else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID;
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
}
ret = 0;
error:
@@ -53,91 +54,27 @@ error:
return ret;
}
-/*
- * walks the btree of allocated inodes and find a hole.
- */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 *objectid)
{
- struct btrfs_path *path;
- struct btrfs_key key;
int ret;
- int slot = 0;
- u64 last_ino = 0;
- int start_found;
- struct extent_buffer *l;
- struct btrfs_key search_key;
- u64 search_start = dirid;
-
mutex_lock(&root->objectid_mutex);
- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
- *objectid = ++root->last_inode_alloc;
- mutex_unlock(&root->objectid_mutex);
- return 0;
- }
- path = btrfs_alloc_path();
- BUG_ON(!path);
- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
- search_key.objectid = search_start;
- search_key.type = 0;
- search_key.offset = 0;
-
- start_found = 0;
- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- if (!start_found) {
- *objectid = search_start;
- start_found = 1;
- goto found;
- }
- *objectid = last_ino > search_start ?
- last_ino : search_start;
- goto found;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
- if (key.objectid >= search_start) {
- if (start_found) {
- if (last_ino < search_start)
- last_ino = search_start;
- if (key.objectid > last_ino) {
- *objectid = last_ino;
- goto found;
- }
- } else if (key.objectid > search_start) {
- *objectid = search_start;
- goto found;
- }
- }
- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
- break;
+ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+ if (ret)
+ goto out;
+ }
- start_found = 1;
- last_ino = key.objectid + 1;
- path->slots[0]++;
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ ret = -ENOSPC;
+ goto out;
}
- BUG_ON(1);
-found:
- btrfs_release_path(root, path);
- btrfs_free_path(path);
- BUG_ON(*objectid < search_start);
- mutex_unlock(&root->objectid_mutex);
- return 0;
-error:
- btrfs_release_path(root, path);
- btrfs_free_path(path);
+
+ *objectid = ++root->highest_objectid;
+ ret = 0;
+out:
mutex_unlock(&root->objectid_mutex);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9096fd0ca3ca..dae12dc7e159 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
-static struct file_operations btrfs_dir_file_operations;
+static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
}
ret = btrfs_drop_extents(trans, root, inode, start,
- aligned_end, aligned_end, start, &hint_byte);
+ aligned_end, aligned_end, start,
+ &hint_byte, 1);
BUG_ON(ret);
if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
- btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
@@ -423,9 +424,12 @@ again:
* and free up our temp pages.
*/
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL, 1, 0,
- 0, 1, 1, 1);
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
ret = 0;
goto free_pages_out;
}
@@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
* clear dirty, set writeback and unlock the pages.
*/
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, 1, 1, 0, 1, 1, 0);
+ &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
ret = btrfs_submit_compressed_write(inode,
async_extent->start,
@@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode,
start, end, 0, NULL);
if (ret == 0) {
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL, 1, 1,
- 1, 1, 1, 1);
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
@@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+ read_lock(&BTRFS_I(inode)->extent_tree.lock);
+ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+ start, num_bytes);
+ if (em) {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ read_unlock(&BTRFS_I(inode)->extent_tree.lock);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
+ unsigned long op;
+
cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
@@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode,
em = alloc_extent_map(GFP_NOFS);
em->start = start;
em->orig_start = em->start;
-
ram_size = ins.offset;
em->len = ins.offset;
@@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode,
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode,
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
+ *
+ * Do set the Private2 bit so we know this page was properly
+ * setup for writepage
*/
+ op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
+ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2;
+
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
start, start + ram_size - 1,
- locked_page, unlock, 1,
- 1, 0, 0, 0);
+ locked_page, op);
disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
@@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
int limit = 10 * 1024 * 1042;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
- EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+ 1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
async_cow->inode = inode;
@@ -994,6 +1023,7 @@ next_slot:
if (found_key.offset > cur_offset) {
extent_end = found_key.offset;
+ extent_type = 0;
goto out_check;
}
@@ -1080,9 +1110,9 @@ out_check:
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -1100,8 +1130,10 @@ out_check:
BUG_ON(ret);
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
- cur_offset, cur_offset + num_bytes - 1,
- locked_page, 1, 1, 1, 0, 0, 0);
+ cur_offset, cur_offset + num_bytes - 1,
+ locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
return ret;
}
+static int btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 size;
+
+ if (!(orig->state & EXTENT_DELALLOC))
+ return 0;
+
+ size = orig->end - orig->start + 1;
+ if (size > root->fs_info->max_extent) {
+ u64 num_extents;
+ u64 new_size;
+
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+
+ /*
+ * if we break a large extent up then leave oustanding_extents
+ * be, since we've already accounted for the large extent.
+ */
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) < num_extents)
+ return 0;
+ }
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 new_size, old_size;
+ u64 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return 0;
+
+ old_size = other->end - other->start + 1;
+ if (new->start < other->start)
+ new_size = other->end - new->start + 1;
+ else
+ new_size = new->end - other->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= root->fs_info->max_extent) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ return 0;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) > num_extents)
+ return 0;
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ return 0;
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
@@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits)
{
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
@@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
*/
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned long bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ if (bits & EXTENT_DO_ACCOUNTING) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ }
+
spin_lock(&root->fs_info->delalloc_lock);
- if (end - start + 1 > root->fs_info->delalloc_bytes) {
+ if (state->end - state->start + 1 >
+ root->fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs warning: delalloc account "
"%llu %llu\n",
- (unsigned long long)end - start + 1,
+ (unsigned long long)
+ state->end - state->start + 1,
(unsigned long long)
root->fs_info->delalloc_bytes);
btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
BTRFS_I(inode)->delalloc_bytes = 0;
} else {
btrfs_delalloc_free_space(root, inode,
- end - start + 1);
- root->fs_info->delalloc_bytes -= end - start + 1;
- BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+ state->end -
+ state->start + 1);
+ root->fs_info->delalloc_bytes -= state->end -
+ state->start + 1;
+ BTRFS_I(inode)->delalloc_bytes -= state->end -
+ state->start + 1;
}
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -1374,10 +1506,8 @@ again:
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
/* already ordered? We're done */
- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_ORDERED, 0)) {
+ if (PagePrivate2(page))
goto out;
- }
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
@@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
struct inode *inode = page->mapping->host;
struct btrfs_writepage_fixup *fixup;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_ORDERED, 0);
- if (ret)
+ /* this page is properly in the ordered list */
+ if (TestClearPagePrivate2(page))
return 0;
if (PageChecked(page))
@@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(!path);
path->leave_spinning = 1;
+
+ /*
+ * we may be replacing one extent in the tree with another.
+ * The new extent is pinned in the extent map, and we don't want
+ * to drop it from the cache until it is completely in the btree.
+ *
+ * So, tell btrfs_drop_extents to leave this extent in the cache.
+ * the caller is expected to unpin it and allow it to be merged
+ * with the others.
+ */
ret = btrfs_drop_extents(trans, root, inode, file_pos,
file_pos + num_bytes, locked_end,
- file_pos, &hint);
+ file_pos, &hint, 0);
BUG_ON(ret);
ins.objectid = inode->i_ino;
@@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
@@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->len,
compressed, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset,
+ ordered_extent->len);
BUG_ON(ret);
}
unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1623,6 +1763,7 @@ nocow:
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
+ ClearPagePrivate2(page);
return btrfs_finish_ordered_io(page->mapping->host, start, end);
}
@@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
failrec->last_mirror = 0;
failrec->bio_flags = 0;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (em->start > start || em->start + em->len < start) {
free_extent_map(em);
em = NULL;
}
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em || IS_ERR(em)) {
kfree(failrec);
@@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
GFP_NOFS);
return 0;
@@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
return ret;
}
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ dir->i_ino, &index, name, name_len);
+ if (ret < 0) {
+ BUG_ON(ret != -ENOENT);
+ di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ name, name_len);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root, path);
+ index = key.offset;
+ }
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ index, name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+ dir->i_sb->s_dirt = 1;
+
+ btrfs_free_path(path);
+ return 0;
+}
+
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
@@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
- /*
- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
- * the root of a subvolume or snapshot
- */
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
- }
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
+ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ err = btrfs_unlink_subvol(trans, root, dir,
+ BTRFS_I(inode)->location.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ goto out;
+ }
+
err = btrfs_orphan_add(trans, inode);
if (err)
- goto fail_trans;
+ goto out;
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
if (!err)
btrfs_i_size_write(inode, 0);
-
-fail_trans:
+out:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
@@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
if ((offset & (blocksize - 1)) == 0)
goto out;
+ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret)
+ goto out;
ret = -ENOMEM;
again:
page = grab_cache_page(mapping, index);
- if (!page)
+ if (!page) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
goto out;
+ }
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -2864,7 +3080,16 @@ again:
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ goto out_unlock;
+ }
+
ret = 0;
if (offset != PAGE_CACHE_SIZE) {
kaddr = kmap(page);
@@ -2877,6 +3102,9 @@ again:
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ if (ret)
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
unlock_page(page);
page_cache_release(page);
out:
@@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err;
+ int err = 0;
if (size <= hole_start)
return 0;
- err = btrfs_check_metadata_free_space(root);
+ err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
if (err)
return err;
- btrfs_truncate_page(inode->i_mapping, inode->i_size);
-
while (1) {
struct btrfs_ordered_extent *ordered;
btrfs_wait_ordered_range(inode, hole_start,
@@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
cur_offset,
cur_offset + hole_size,
block_end,
- cur_offset, &hint_byte);
+ cur_offset, &hint_byte, 1);
+ if (err)
+ break;
+
+ err = btrfs_reserve_metadata_space(root, 1);
if (err)
break;
+
err = btrfs_insert_file_extent(trans, root,
inode->i_ino, cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
+ btrfs_unreserve_metadata_space(root, 1);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode)
}
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (inode->i_nlink > 0) {
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+ goto no_delete;
+ }
+
btrfs_i_size_write(inode, 0);
trans = btrfs_join_transaction(root, 1);
@@ -3070,29 +3307,67 @@ out_err:
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_root *root,
- struct btrfs_key *location,
- struct btrfs_root **sub_root,
- struct dentry *dentry)
+ struct inode *dir,
+ struct dentry *dentry,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root)
{
- struct btrfs_root_item *ri;
+ struct btrfs_path *path;
+ struct btrfs_root *new_root;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ int ret;
+ int err = 0;
- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
- return 0;
- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
- return 0;
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
- *sub_root = btrfs_read_fs_root(root->fs_info, location,
- dentry->d_name.name,
- dentry->d_name.len);
- if (IS_ERR(*sub_root))
- return PTR_ERR(*sub_root);
+ err = -ENOENT;
+ ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+ BTRFS_I(dir)->root->root_key.objectid,
+ location->objectid);
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
- ri = &(*sub_root)->root_item;
- location->objectid = btrfs_root_dirid(ri);
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
- location->offset = 0;
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ goto out;
- return 0;
+ ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+ (unsigned long)(ref + 1),
+ dentry->d_name.len);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(root->fs_info->tree_root, path);
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+ if (IS_ERR(new_root)) {
+ err = PTR_ERR(new_root);
+ goto out;
+ }
+
+ if (btrfs_root_refs(&new_root->root_item) == 0) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ *sub_root = new_root;
+ location->objectid = btrfs_root_dirid(&new_root->root_item);
+ location->type = BTRFS_INODE_ITEM_KEY;
+ location->offset = 0;
+ err = 0;
+out:
+ btrfs_free_path(path);
+ return err;
}
static void inode_tree_add(struct inode *inode)
@@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode)
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
-
again:
p = &root->inode_tree.rb_node;
parent = NULL;
+ if (hlist_unhashed(&inode->i_hash))
+ return;
+
spin_lock(&root->inode_lock);
while (*p) {
parent = *p;
@@ -3132,13 +3409,87 @@ again:
static void inode_tree_del(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ int empty = 0;
spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
+
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ synchronize_srcu(&root->fs_info->subvol_srcu);
+ spin_lock(&root->inode_lock);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ spin_unlock(&root->inode_lock);
+ if (empty)
+ btrfs_add_dead_root(root);
+ }
+}
+
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < entry->vfs_inode.i_ino)
+ node = node->rb_left;
+ else if (objectid > entry->vfs_inode.i_ino)
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= entry->vfs_inode.i_ino) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = entry->vfs_inode.i_ino + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will remove it from
+ * the inode cache when its usage count
+ * hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return 0;
}
static noinline void init_btrfs_i(struct inode *inode)
@@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->generation = 0;
bi->sequence = 0;
bi->last_trans = 0;
+ bi->last_sub_trans = 0;
bi->logged_trans = 0;
bi->delalloc_bytes = 0;
bi->reserved_bytes = 0;
@@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return inode;
}
+static struct inode *new_simple_dir(struct super_block *s,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
+{
+ struct inode *inode = new_inode(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ init_btrfs_i(inode);
+
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->dummy_inode = 1;
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ return inode;
+}
+
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct inode *inode;
- struct btrfs_inode *bi = BTRFS_I(dir);
- struct btrfs_root *root = bi->root;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
+ int index;
int ret;
+ dentry->d_op = &btrfs_dentry_operations;
+
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret < 0)
return ERR_PTR(ret);
- inode = NULL;
- if (location.objectid) {
- ret = fixup_tree_root_location(root, &location, &sub_root,
- dentry);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
+ if (location.objectid == 0)
+ return NULL;
+
+ if (location.type == BTRFS_INODE_ITEM_KEY) {
+ inode = btrfs_iget(dir->i_sb, &location, root);
+ return inode;
+ }
+
+ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+ index = srcu_read_lock(&root->fs_info->subvol_srcu);
+ ret = fixup_tree_root_location(root, dir, dentry,
+ &location, &sub_root);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ inode = ERR_PTR(ret);
+ else
+ inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ } else {
inode = btrfs_iget(dir->i_sb, &location, sub_root);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
}
+ srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
return inode;
}
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+ struct btrfs_root *root;
+
+ if (!dentry->d_inode && !IS_ROOT(dentry))
+ dentry = dentry->d_parent;
+
+ if (dentry->d_inode) {
+ root = BTRFS_I(dentry->d_inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ }
+ return 0;
+}
+
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct inode *inode;
- if (dentry->d_name.len > BTRFS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
inode = btrfs_lookup_dentry(dir, dentry);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail;
- if (objectid > root->highest_inode)
- root->highest_inode = objectid;
-
inode->i_uid = current_fsuid();
if (dir && (dir->i_mode & S_ISGID)) {
@@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index)
{
- int ret;
+ int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
- key.objectid = inode->i_ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
- key.offset = 0;
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+ } else {
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ }
+
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_inode->i_ino,
+ index, name, name_len);
+ } else if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root,
+ name, name_len, inode->i_ino,
+ parent_inode->i_ino, index);
+ }
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode->i_ino,
- &key, btrfs_inode_type(inode),
- index);
if (ret == 0) {
- if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root,
- name, name_len,
- inode->i_ino,
- parent_inode->i_ino,
- index);
- }
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode->i_ino, &key,
+ btrfs_inode_type(inode), index);
+ BUG_ON(ret);
+
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -3732,11 +4139,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3774,6 +4188,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
+
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3838,6 +4261,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink == 0)
return -ENOENT;
- btrfs_inc_nlink(inode);
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ err = btrfs_reserve_metadata_space(root, 3);
if (err)
- goto fail;
+ return err;
+
+ btrfs_inc_nlink(inode);
+
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
@@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_add_nondir(trans, dentry, inode, 1, index);
- if (err)
- drop_inode = 1;
-
- btrfs_update_inode_block_group(trans, dir);
- err = btrfs_update_inode(trans, root, inode);
-
- if (err)
+ if (err) {
drop_inode = 1;
+ } else {
+ btrfs_update_inode_block_group(trans, dir);
+ err = btrfs_update_inode(trans, root, inode);
+ BUG_ON(err);
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ }
nr = trans->blocks_used;
-
- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode and ref
+ * 2 items for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_unlock;
+ return err;
trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, dir);
-
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (!trans) {
+ err = -ENOMEM;
goto out_unlock;
}
+ btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
if (err) {
@@ -3967,6 +4400,7 @@ out_fail:
btrfs_end_transaction_throttle(trans, root);
out_unlock:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int compressed;
again:
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
em->bdev = root->fs_info->fs_devices->latest_bdev;
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4649,11 @@ again:
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
+ if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ memset(map + pg_offset + copy_size, 0,
+ PAGE_CACHE_SIZE - pg_offset -
+ copy_size);
+ }
kunmap(page);
}
flush_dcache_page(page);
@@ -4259,7 +4698,7 @@ insert:
}
err = 0;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
@@ -4299,7 +4738,7 @@ insert:
err = 0;
}
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
out:
if (path)
btrfs_free_path(path);
@@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ /*
+ * we have the page locked, so new writeback can't start,
+ * and the dirty bit won't be cleared while we are here.
+ *
+ * Wait for IO on this page so that we can safely clear
+ * the PagePrivate2 bit and do ordered accounting
+ */
wait_on_page_writeback(page);
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
-
lock_extent(tree, page_start, page_end, GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
@@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED, 1, 0, GFP_NOFS);
- btrfs_finish_ordered_io(page->mapping->host,
- page_start, page_end);
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+ NULL, GFP_NOFS);
+ /*
+ * whoever cleared the private bit is responsible
+ * for the finish_ordered_io
+ */
+ if (TestClearPagePrivate2(page)) {
+ btrfs_finish_ordered_io(page->mapping->host,
+ page_start, page_end);
+ }
btrfs_put_ordered_extent(ordered);
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_ORDERED,
- 1, 1, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
@@ -4504,7 +4964,24 @@ again:
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ /*
+ * XXX - page_mkwrite gets called every time the page is dirtied, even
+ * if it was already dirty, so for space accounting reasons we need to
+ * clear any delalloc bits for the range we are fixing to save. There
+ * is probably a better way to do this, but for now keep consistent with
+ * prepare_pages in the normal write path.
+ */
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ ret = VM_FAULT_SIGBUS;
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ goto out_unlock;
+ }
ret = 0;
/* page is wholly or partially inside EOF */
@@ -4521,11 +4998,17 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
+ SetPageUptodate(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ if (!ret)
+ return VM_FAULT_LOCKED;
unlock_page(page);
out:
return ret;
@@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode)
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
- btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ if (ret)
+ return;
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
trans = btrfs_start_transaction(root, 1);
@@ -4594,11 +5079,11 @@ out:
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, struct dentry *dentry,
+ struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint)
{
struct inode *inode;
- int error;
+ int err;
u64 index = 0;
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_nlink = 1;
btrfs_i_size_write(inode, 0);
- error = btrfs_update_inode(trans, new_root, inode);
- if (error)
- return error;
+ err = btrfs_update_inode(trans, new_root, inode);
+ BUG_ON(err);
- d_instantiate(dentry, inode);
+ iput(inode);
return 0;
}
@@ -4640,7 +5124,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
ei->last_trans = 0;
+ ei->last_sub_trans = 0;
ei->logged_trans = 0;
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
+ spin_lock_init(&ei->accounting_lock);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4693,6 +5181,16 @@ void btrfs_destroy_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
+void btrfs_drop_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+ generic_delete_inode(inode);
+ else
+ generic_drop_inode(inode);
+}
+
static void init_once(void *foo)
{
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4761,31 +5259,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec ctime = CURRENT_TIME;
u64 index = 0;
+ u64 root_objectid;
int ret;
- /* we're not allowed to rename between subvolumes */
- if (BTRFS_I(old_inode)->root->root_key.objectid !=
- BTRFS_I(new_dir)->root->root_key.objectid)
+ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
+ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+ return -ENOTEMPTY;
+
if (S_ISDIR(old_inode->i_mode) && new_inode &&
- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- }
- /* to rename a snapshot or subvolume, we need to juggle the
- * backrefs. This isn't coded yet
+ /*
+ * 2 items for dir items
+ * 1 item for orphan entry
+ * 1 item for ref
*/
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
- return -EXDEV;
-
- ret = btrfs_check_metadata_free_space(root);
+ ret = btrfs_reserve_metadata_space(root, 4);
if (ret)
- goto out_unlock;
+ return ret;
/*
* we're using rename to replace one file with another.
@@ -4796,8 +5300,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(old_inode->i_mapping);
+ /* close the racy window with snapshot create/destroy ioctl */
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+
trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, new_dir);
+
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+ ret = btrfs_set_inode_index(new_dir, &index);
+ if (ret)
+ goto out_fail;
+
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ /* force full log commit if subvolume involved. */
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_inode->i_ino,
+ new_dir->i_ino, index);
+ if (ret)
+ goto out_fail;
+ /*
+ * this is an ugly little race, but the rename is required
+ * to make sure that if we crash, the inode is either at the
+ * old name or the new one. pinning the log transaction lets
+ * us make sure we don't allow a log commit to come in after
+ * we unlink the name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+ }
/*
* make sure the inode gets flushed if it is replacing
* something.
@@ -4807,18 +5343,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_add_ordered_operation(trans, root, old_inode);
}
- /*
- * this is an ugly little race, but the rename is required to make
- * sure that if we crash, the inode is either at the old name
- * or the new one. pinning the log transaction lets us make sure
- * we don't allow a log commit to come in after we unlink the
- * name but before we add the new name back in.
- */
- btrfs_pin_log_trans(root);
-
- btrfs_set_trans_block_group(trans, new_dir);
-
- btrfs_inc_nlink(old_dentry->d_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
@@ -4826,47 +5350,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
- if (ret)
- goto out_fail;
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else {
+ btrfs_inc_nlink(old_dentry->d_inode);
+ ret = btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ }
+ BUG_ON(ret);
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_unlink_inode(trans, root, new_dir,
- new_dentry->d_inode,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
- if (ret)
- goto out_fail;
+ if (unlikely(new_inode->i_ino ==
+ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ root_objectid = BTRFS_I(new_inode)->location.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ BUG_ON(new_inode->i_nlink == 0);
+ } else {
+ ret = btrfs_unlink_inode(trans, dest, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ }
+ BUG_ON(ret);
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
- if (ret)
- goto out_fail;
+ BUG_ON(ret);
}
-
}
- ret = btrfs_set_inode_index(new_dir, &index);
- if (ret)
- goto out_fail;
- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
- old_inode, new_dentry->d_name.name,
- new_dentry->d_name.len, 1, index);
- if (ret)
- goto out_fail;
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, index);
+ BUG_ON(ret);
- btrfs_log_new_name(trans, old_inode, old_dir,
- new_dentry->d_parent);
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
+ btrfs_end_log_trans(root);
+ }
out_fail:
-
- /* this btrfs_end_log_trans just allows the current
- * log-sub transaction to complete
- */
- btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
-out_unlock:
+
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ btrfs_unreserve_metadata_space(root, 4);
return ret;
}
@@ -4938,11 +5475,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode item and ref
+ * 2 items for dir items
+ * 1 item for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto out_fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5023,6 +5567,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
out_fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -5044,6 +5589,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
+
+ ret = btrfs_reserve_metadata_space(root, 1);
+ if (ret)
+ goto out;
+
ret = btrfs_reserve_extent(trans, root, alloc_size,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
@@ -5058,9 +5608,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset -1, 0);
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
+ btrfs_unreserve_metadata_space(root, 1);
}
out:
if (cur_offset > start) {
@@ -5223,7 +5776,8 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
};
-static struct file_operations btrfs_dir_file_operations = {
+
+static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = btrfs_real_readdir,
@@ -5245,6 +5799,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
+ .merge_extent_hook = btrfs_merge_extent_hook,
+ .split_extent_hook = btrfs_split_extent_hook,
};
/*
@@ -5269,6 +5825,7 @@ static const struct address_space_operations btrfs_aops = {
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
.set_page_dirty = btrfs_set_page_dirty,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations btrfs_symlink_aops = {
@@ -5309,3 +5866,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
+
+const struct dentry_operations btrfs_dentry_operations = {
+ .d_delete = btrfs_dentry_delete,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd88f25889f7..cdbb054102b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- struct btrfs_root *new_root = root;
- struct inode *dir;
+ struct btrfs_root *new_root;
+ struct inode *dir = dentry->d_parent->d_inode;
int ret;
int err;
u64 objectid;
@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 index = 0;
unsigned long nr = 1;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
- goto fail_commit;
+ return ret;
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
if (ret)
goto fail;
+ key.offset = (u64)-1;
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(new_root));
+
+ btrfs_record_root_in_trans(trans, new_root);
+
+ ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+ BTRFS_I(dir)->block_group);
/*
* insert the directory item
*/
- key.offset = (u64)-1;
- dir = dentry->d_parent->d_inode;
ret = btrfs_set_inode_index(dir, &index);
BUG_ON(ret);
@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
- /* add the backref first */
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- objectid, BTRFS_ROOT_BACKREF_KEY,
- root->root_key.objectid,
+ objectid, root->root_key.objectid,
dir->i_ino, index, name, namelen);
BUG_ON(ret);
- /* now add the forward ref */
- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- root->root_key.objectid, BTRFS_ROOT_REF_KEY,
- objectid,
- dir->i_ino, index, name, namelen);
-
- BUG_ON(ret);
-
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- goto fail_commit;
-
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(!new_root);
-
- trans = btrfs_start_transaction(new_root, 1);
- BUG_ON(!trans);
-
- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
- BTRFS_I(dir)->block_group);
- if (ret)
- goto fail;
-
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
nr = trans->blocks_used;
- err = btrfs_commit_transaction(trans, new_root);
+ err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
-fail_commit:
+
+ btrfs_unreserve_metadata_space(root, 6);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
goto fail_unlock;
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot) {
ret = -ENOMEM;
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
if (!pending_snapshot->name) {
ret = -ENOMEM;
kfree(pending_snapshot);
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
memcpy(pending_snapshot->name, name, namelen);
@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(struct path *parent, char *name,
- int mode, int namelen,
+static noinline int btrfs_mksubvol(struct path *parent,
+ char *name, int namelen,
struct btrfs_root *snap_src)
{
+ struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
int error;
- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
if (dentry->d_inode)
goto out_dput;
- if (!IS_POSIXACL(parent->dentry->d_inode))
- mode &= ~current_umask();
-
error = mnt_want_write(parent->mnt);
if (error)
goto out_dput;
- error = btrfs_may_create(parent->dentry->d_inode, dentry);
+ error = btrfs_may_create(dir, dentry);
if (error)
goto out_drop_write;
- /*
- * Actually perform the low-level subvolume creation after all
- * this VFS fuzz.
- *
- * Eventually we want to pass in an inode under which we create this
- * subvolume, but for now all are under the filesystem root.
- *
- * Also we should pass on the mode eventually to allow creating new
- * subvolume with specific mode bits.
- */
+ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+ goto out_up_read;
+
if (snap_src) {
- struct dentry *dir = dentry->d_parent;
- struct dentry *test = dir->d_parent;
- struct btrfs_path *path = btrfs_alloc_path();
- int ret;
- u64 test_oid;
- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
-
- test_oid = snap_src->root_key.objectid;
-
- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
- path, parent_oid, test_oid);
- if (ret == 0)
- goto create;
- btrfs_release_path(snap_src->fs_info->tree_root, path);
-
- /* we need to make sure we aren't creating a directory loop
- * by taking a snapshot of something that has our current
- * subvol in its directory tree. So, this loops through
- * the dentries and checks the forward refs for each subvolume
- * to see if is references the subvolume where we are
- * placing this new snapshot.
- */
- while (1) {
- if (!test ||
- dir == snap_src->fs_info->sb->s_root ||
- test == snap_src->fs_info->sb->s_root ||
- test->d_inode->i_sb != snap_src->fs_info->sb) {
- break;
- }
- if (S_ISLNK(test->d_inode->i_mode)) {
- printk(KERN_INFO "Btrfs symlink in snapshot "
- "path, failed\n");
- error = -EMLINK;
- btrfs_free_path(path);
- goto out_drop_write;
- }
- test_oid =
- BTRFS_I(test->d_inode)->root->root_key.objectid;
- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
- path, test_oid, parent_oid);
- if (ret == 0) {
- printk(KERN_INFO "Btrfs snapshot creation "
- "failed, looping\n");
- error = -EMLINK;
- btrfs_free_path(path);
- goto out_drop_write;
- }
- btrfs_release_path(snap_src->fs_info->tree_root, path);
- test = test->d_parent;
- }
-create:
- btrfs_free_path(path);
- error = create_snapshot(snap_src, dentry, name, namelen);
+ error = create_snapshot(snap_src, dentry,
+ name, namelen);
} else {
- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
- dentry, name, namelen);
+ error = create_subvol(BTRFS_I(dir)->root, dentry,
+ name, namelen);
}
- if (error)
- goto out_drop_write;
-
- fsnotify_mkdir(parent->dentry->d_inode, dentry);
+ if (!error)
+ fsnotify_mkdir(dir, dentry);
+out_up_read:
+ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
out_drop_write:
mnt_drop_write(parent->mnt);
out_dput:
dput(dentry);
out_unlock:
- mutex_unlock(&parent->dentry->d_inode->i_mutex);
+ mutex_unlock(&dir->i_mutex);
return error;
}
-
static int btrfs_defrag_file(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
@@ -596,9 +534,8 @@ again:
clear_page_dirty_for_io(page);
btrfs_set_extent_delalloc(inode, page_start, page_end);
-
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -609,7 +546,8 @@ out_unlock:
return 0;
}
-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+ void __user *arg)
{
u64 new_size;
u64 old_size;
@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
struct file *src_file;
- u64 root_dirid;
int namelen;
int ret = 0;
@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
- path, root_dirid,
- vol_args->name, namelen, 0);
- btrfs_free_path(path);
-
- if (di && !IS_ERR(di)) {
- ret = -EEXIST;
- goto out;
- }
-
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
-
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
- file->f_path.dentry->d_inode->i_mode,
- namelen, NULL);
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ NULL);
} else {
struct inode *src_inode;
src_file = fget(vol_args->fd);
@@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
fput(src_file);
goto out;
}
- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
- file->f_path.dentry->d_inode->i_mode,
- namelen, BTRFS_I(src_inode)->root);
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ BTRFS_I(src_inode)->root);
fput(src_file);
}
-
out:
kfree(vol_args);
return ret;
}
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+ void __user *arg)
+{
+ struct dentry *parent = fdentry(file);
+ struct dentry *dentry;
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *dest = NULL;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ int namelen;
+ int ret;
+ int err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/') ||
+ strncmp(vol_args->name, "..", namelen) == 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
+ goto out;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_one_len(vol_args->name, parent, namelen);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock_dir;
+ }
+
+ if (!dentry->d_inode) {
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ inode = dentry->d_inode;
+ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out_dput;
+ }
+
+ dest = BTRFS_I(inode)->root;
+
+ mutex_lock(&inode->i_mutex);
+ err = d_invalidate(dentry);
+ if (err)
+ goto out_unlock;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ BUG_ON(ret);
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ inode->i_flags |= S_DEAD;
+out_up_write:
+ up_write(&root->fs_info->subvol_sem);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ if (!err) {
+ shrink_dcache_sb(root->fs_info->sb);
+ btrfs_invalidate_inodes(dest);
+ d_delete(dentry);
+ }
+out_dput:
+ dput(dentry);
+out_unlock_dir:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(file->f_path.mnt);
+out:
+ kfree(vol_args);
+ return err;
+}
+
static int btrfs_ioctl_defrag(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
@@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
return ret;
}
-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* punch hole in destination first */
btrfs_drop_extents(trans, root, inode, off, off + len,
- off + len, 0, &hint_byte);
+ off + len, 0, &hint_byte, 1);
/* clone data */
key.objectid = src->i_ino;
@@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datao += off - key.offset;
datal -= off - key.offset;
}
- if (key.offset + datao + datal + key.offset >
- off + len)
- datal = off + len - key.offset - datao;
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
@@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- int ret = 0;
+ int ret;
+ ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ goto out;
- if (file->private_data) {
- ret = -EINPROGRESS;
+ ret = -EINPROGRESS;
+ if (file->private_data)
goto out;
- }
ret = mnt_want_write(file->f_path.mnt);
if (ret)
@@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
root->fs_info->open_ioctl_trans++;
mutex_unlock(&root->fs_info->trans_mutex);
+ ret = -ENOMEM;
trans = btrfs_start_ioctl_transaction(root, 0);
- if (trans)
- file->private_data = trans;
- else
- ret = -ENOMEM;
- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+ if (!trans)
+ goto out_drop;
+
+ file->private_data = trans;
+ return 0;
+
+out_drop:
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ mnt_drop_write(file->f_path.mnt);
out:
return ret;
}
@@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- int ret = 0;
trans = file->private_data;
- if (!trans) {
- ret = -EINVAL;
- goto out;
- }
- btrfs_end_transaction(trans, root);
+ if (!trans)
+ return -EINVAL;
file->private_data = NULL;
+ btrfs_end_transaction(trans, root);
+
mutex_lock(&root->fs_info->trans_mutex);
root->fs_info->open_ioctl_trans--;
mutex_unlock(&root->fs_info->trans_mutex);
mnt_drop_write(file->f_path.mnt);
-
-out:
- return ret;
+ return 0;
}
long btrfs_ioctl(struct file *file, unsigned int
@@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SNAP_DESTROY:
+ return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_DEFRAG:
return btrfs_ioctl_defrag(file);
case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b320b103fa13..bc49914475eb 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
struct btrfs_ioctl_vol_args)
-
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+ struct btrfs_ioctl_vol_args)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7b2f401e604e..5799bc46a309 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
*
* len is the length of the extent
*
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->start = start;
entry->len = len;
entry->disk_len = disk_len;
+ entry->bytes_left = len;
entry->inode = inode;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&entry->rb_node);
BUG_ON(node);
- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
- entry_end(entry) - 1, GFP_NOFS);
-
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
- GFP_NOFS);
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
goto out;
}
- ret = test_range_bit(io_tree, entry->file_offset,
- entry->file_offset + entry->len - 1,
- EXTENT_ORDERED, 0);
- if (ret == 0)
+ if (io_size > entry->bytes_left) {
+ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+ (unsigned long long)entry->bytes_left,
+ (unsigned long long)io_size);
+ }
+ entry->bytes_left -= io_size;
+ if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ else
+ ret = 1;
out:
mutex_unlock(&tree->mutex);
return ret == 0;
@@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+ inode, 1);
+
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -460,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
u64 orig_end;
u64 wait_end;
struct btrfs_ordered_extent *ordered;
+ int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
@@ -489,19 +494,18 @@ again:
/* start IO across the range first to instantiate any delalloc
* extents
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
/* The compression code will leave pages locked but return from
* writepage without setting the page writeback. Starting again
* with WB_SYNC_ALL will end up waiting for the IO to actually start.
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
- btrfs_wait_on_page_writeback_range(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT,
- orig_end >> PAGE_CACHE_SHIFT);
+ filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
+ found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
@@ -514,6 +518,7 @@ again:
btrfs_put_ordered_extent(ordered);
break;
}
+ found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
@@ -521,8 +526,8 @@ again:
break;
end--;
}
- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+ EXTENT_DELALLOC, 0, NULL)) {
schedule_timeout(1);
goto again;
}
@@ -613,7 +618,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (test_range_bit(io_tree, disk_i_size,
ordered->file_offset + ordered->len - 1,
- EXTENT_DELALLOC, 0)) {
+ EXTENT_DELALLOC, 0, NULL)) {
goto out;
}
/*
@@ -664,7 +669,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (i_size_test > entry_end(ordered) &&
!test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
- EXTENT_DELALLOC, 0)) {
+ EXTENT_DELALLOC, 0, NULL)) {
new_i_size = min_t(u64, i_size_test, i_size_read(inode));
}
BTRFS_I(inode)->disk_i_size = new_i_size;
@@ -715,89 +720,6 @@ out:
}
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
- * @mapping: address space structure to write
- * @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends (inclusive)
- * @sync_mode: enable synchronous operation
- *
- * Start writeback against all of a mapping's dirty pages that lie
- * within the byte offsets <start, end> inclusive.
- *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback. The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
- */
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode)
-{
- struct writeback_control wbc = {
- .sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = start,
- .range_end = end,
- };
- return btrfs_writepages(mapping, &wbc);
-}
-
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping: target address_space
- * @start: beginning page index
- * @end: ending page index
- *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
- */
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
-{
- struct pagevec pvec;
- int nr_pages;
- int ret = 0;
- pgoff_t index;
-
- if (end < start)
- return 0;
-
- pagevec_init(&pvec, 0);
- index = start;
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
- unsigned i;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
-
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
- wait_on_page_writeback(page);
- if (PageError(page))
- ret = -EIO;
- }
- pagevec_release(&pvec);
- cond_resched();
- }
-
- /* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
- ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
- ret = -EIO;
-
- return ret;
-}
-
/*
* add a given inode to the list of inodes that must be fully on
* disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b01..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
/* extent length on disk */
u64 disk_len;
+ /* number of bytes that still need writing */
+ u64 bytes_left;
+
/* flags (described above) */
unsigned long flags;
@@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
int btrfs_ordered_update_i_size(struct inode *inode,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end);
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 3c0d52af4f80..79cba5fbc28e 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -65,3 +65,23 @@ out:
btrfs_free_path(path);
return ret;
}
+
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c04f7f212602..cfcc93c93a7b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -121,6 +121,15 @@ struct inodevec {
int nr;
};
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+ u64 start;
+ u64 end;
+ u64 boundary[MAX_EXTENTS];
+ unsigned int nr;
+};
+
struct reloc_control {
/* block group to relocate */
struct btrfs_block_group_cache *block_group;
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
struct reloc_control *rc)
{
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
return 0;
}
@@ -2529,56 +2538,94 @@ out:
}
static noinline_for_stack
-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+ u64 block_start)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret = 0;
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
+
+ em->start = start;
+ em->len = end + 1 - start;
+ em->block_len = em->len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 page_start;
u64 page_end;
- unsigned long i;
- unsigned long first_index;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ unsigned long index;
unsigned long last_index;
- unsigned int total_read = 0;
- unsigned int total_dirty = 0;
+ unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
- struct btrfs_ordered_extent *ordered;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int nr = 0;
int ret = 0;
+ if (!cluster->nr)
+ return 0;
+
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return -ENOMEM;
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
mutex_lock(&inode->i_mutex);
- first_index = start >> PAGE_CACHE_SHIFT;
- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
- /* make sure the dirty trick played by the caller work */
- while (1) {
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- first_index, last_index);
- if (ret != -EBUSY)
- break;
- schedule_timeout(HZ/10);
- }
+ i_size_write(inode, cluster->end + 1 - offset);
+ ret = setup_extent_mapping(inode, cluster->start - offset,
+ cluster->end - offset, cluster->start);
if (ret)
goto out_unlock;
file_ra_state_init(ra, inode->i_mapping);
- for (i = first_index ; i <= last_index; i++) {
- if (total_read % ra->ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, ra, NULL, i,
- min(last_index, ra->ra_pages + i - 1));
- }
- total_read++;
-again:
- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
- BUG_ON(1);
- page = grab_cache_page(inode->i_mapping, i);
+ WARN_ON(cluster->start != cluster->boundary[0]);
+ while (index <= last_index) {
+ page = find_lock_page(inode->i_mapping, index);
if (!page) {
- ret = -ENOMEM;
- goto out_unlock;
+ page_cache_sync_readahead(inode->i_mapping,
+ ra, NULL, index,
+ last_index + 1 - index);
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(inode->i_mapping,
+ ra, NULL, page, index,
+ last_index + 1 - index);
}
+
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
@@ -2589,75 +2636,79 @@ again:
goto out_unlock;
}
}
- wait_on_page_writeback(page);
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
+
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
+
set_page_extent_mapped(page);
- if (i == first_index)
- set_extent_bits(io_tree, page_start, page_end,
+ if (nr < cluster->nr &&
+ page_start + offset == cluster->boundary[nr]) {
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end,
EXTENT_BOUNDARY, GFP_NOFS);
+ nr++;
+ }
btrfs_set_extent_delalloc(inode, page_start, page_end);
set_page_dirty(page);
- total_dirty++;
+ dirty_page++;
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
+
+ index++;
+ if (nr < cluster->nr &&
+ page_end + 1 + offset == cluster->boundary[nr]) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
+ dirty_page = 0;
+ }
+ }
+ if (dirty_page) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
}
+ WARN_ON(nr != cluster->nr);
out_unlock:
mutex_unlock(&inode->i_mutex);
kfree(ra);
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
return ret;
}
static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_map *em;
- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
- u64 end = start + extent_key->offset - 1;
-
- em = alloc_extent_map(GFP_NOFS);
- em->start = start;
- em->len = extent_key->offset;
- em->block_len = extent_key->offset;
- em->block_start = extent_key->objectid;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ int ret;
- /* setup extent map to cheat btrfs_readpage */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
- while (1) {
- int ret;
- spin_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(inode, start, end, 0);
+ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
- return relocate_inode_pages(inode, start, extent_key->offset);
+ if (!cluster->nr)
+ cluster->start = extent_key->objectid;
+ else
+ BUG_ON(cluster->nr >= MAX_EXTENTS);
+ cluster->end = extent_key->objectid + extent_key->offset - 1;
+ cluster->boundary[cluster->nr] = extent_key->objectid;
+ cluster->nr++;
+
+ if (cluster->nr >= MAX_EXTENTS) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+ return 0;
}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
return 0;
}
+
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
+ struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
+ cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+ if (!cluster)
+ return -ENOMEM;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ rc->extents_found = 0;
+ rc->extents_skipped = 0;
+
rc->search_start = rc->block_group->key.objectid;
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
GFP_NOFS);
@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_end_transaction(trans, rc->extent_root);
trans = NULL;
btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
- ret = relocate_data_extent(rc->data_inode, &key);
+ ret = relocate_data_extent(rc->data_inode,
+ &key, cluster);
if (ret < 0) {
err = ret;
break;
@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
+ if (!err) {
+ ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ if (ret < 0)
+ err = ret;
+ }
+
+ kfree(cluster);
+
rc->create_reloc_root = 0;
smp_mb();
@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 size)
+ struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_inode_item *item;
@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
- btrfs_set_inode_size(leaf, item, size);
+ btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
btrfs_mark_buffer_dirty(leaf);
@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (err)
goto out;
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0, group->key.offset,
- 0, 0, 0);
+ err = __insert_orphan_inode(trans, root, objectid);
BUG_ON(err);
key.objectid = objectid;
@@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
BUG_ON(!rc->block_group);
btrfs_init_workers(&rc->workers, "relocate",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size, NULL);
rc->extent_root = extent_root;
btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(fs_info->tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
-
rc->extents_found = 0;
rc->extents_skipped = 0;
+ mutex_lock(&fs_info->cleaner_mutex);
+
+ btrfs_clean_old_snapshots(fs_info->tree_root);
ret = relocate_block_group(rc);
+
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
break;
@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
}
}
- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
- rc->block_group->key.objectid,
- rc->block_group->key.objectid +
- rc->block_group->key.offset - 1);
+ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+ rc->block_group->key.objectid,
+ rc->block_group->key.objectid +
+ rc->block_group->key.offset - 1);
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
@@ -3530,6 +3594,26 @@ out:
return err;
}
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(root->root_item.drop_progress));
+ root->root_item.drop_level = 0;
+ btrfs_set_root_refs(&root->root_item, 0);
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ BUG_ON(ret);
+
+ ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ BUG_ON(ret);
+ return 0;
+}
+
/*
* recover relocation interrupted by system crash.
*
@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
- err = PTR_ERR(fs_root);
- goto out;
+ ret = PTR_ERR(fs_root);
+ if (ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+ mark_garbage_root(reloc_root);
}
}
@@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
mapping_tree_init(&rc->reloc_root_tree);
INIT_LIST_HEAD(&rc->reloc_roots);
btrfs_init_workers(&rc->workers, "relocate",
- root->fs_info->thread_pool_size);
+ root->fs_info->thread_pool_size, NULL);
rc->extent_root = root->fs_info->extent_root;
set_reloc_control(rc);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ddc6d61c55a..9351428f30e2 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
goto out;
BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = 1;
+ goto out;
+ }
l = path->nodes[0];
- BUG_ON(path->slots[0] == 0);
slot = path->slots[0] - 1;
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid) {
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
- sizeof(*item));
- memcpy(key, &found_key, sizeof(found_key));
+ if (item)
+ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+ sizeof(*item));
+ if (key)
+ memcpy(key, &found_key, sizeof(found_key));
ret = 0;
out:
btrfs_free_path(path);
@@ -249,6 +255,59 @@ err:
return ret;
}
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(tree_root, path);
+
+ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_find_dead_roots(tree_root, key.offset);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ key.offset++;
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
/* drop the root item for 'key' from 'root' */
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key)
@@ -278,31 +337,57 @@ out:
return ret;
}
-#if 0 /* this will get used when snapshot deletion is implemented */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id)
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len)
+
{
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
struct btrfs_key key;
+ unsigned long ptr;
+ int err = 0;
int ret;
- struct btrfs_path *path;
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
key.objectid = root_id;
- key.type = type;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
-
+again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret);
-
- ret = btrfs_del_item(trans, tree_root, path);
- BUG_ON(ret);
+ BUG_ON(ret < 0);
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+
+ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+ ptr = (unsigned long)(ref + 1);
+ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ BUG_ON(ret);
+ } else
+ err = -ENOENT;
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
btrfs_free_path(path);
- return ret;
+ return err;
}
-#endif
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
return ret;
}
-
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id,
- u64 dirid, u64 sequence,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len)
{
struct btrfs_key key;
@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
unsigned long ptr;
-
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
key.objectid = root_id;
- key.type = type;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
-
+again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
BUG_ON(ret);
@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
btrfs_free_path(path);
- return ret;
+ return 0;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2db17cd66fc5..752a5463bf53 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,8 @@ enum {
Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
- Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+ Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+ Opt_discard, Opt_err,
};
static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
{Opt_notreelog, "notreelog"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
+ {Opt_discard, "discard"},
{Opt_err, NULL},
};
@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->metadata_ratio);
}
break;
+ case Opt_discard:
+ btrfs_set_opt(info->mount_opt, DISCARD);
+ break;
default:
break;
}
@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
+#endif
tree_root = open_ctree(sb, fs_devices, (char *)data);
@@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb)
}
static const struct super_operations btrfs_super_ops = {
+ .drop_inode = btrfs_drop_inode,
.delete_inode = btrfs_delete_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cdbb5022da52..bca82a4ca8e6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
{
if (root->ref_cows && root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
- WARN_ON(root->root_item.refs == 0);
WARN_ON(root->commit_root != root->node);
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->alloc_exclude_start = 0;
h->delayed_ref_updates = 0;
+ if (!current->journal_info)
+ current->journal_info = h;
+
root->fs_info->running_transaction->use_count++;
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
mutex_unlock(&info->trans_mutex);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * those extents are sent to disk but does not wait on them
*/
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages)
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
{
int ret;
int err = 0;
@@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
page_cache_release(page);
}
}
+ if (err)
+ werr = err;
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit. We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int err = 0;
+ int werr = 0;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
while (1) {
ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
EXTENT_DIRTY);
@@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
return werr;
}
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int ret2;
+
+ ret = btrfs_write_marked_extents(root, dirty_pages);
+ ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+ return ret || ret2;
+}
+
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
key.objectid = objectid;
- key.offset = 0;
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
@@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(&pending->root_key, &key, sizeof(key));
fail:
kfree(new_root_item);
+ btrfs_unreserve_metadata_space(root, 6);
return ret;
}
@@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
- /* add the backref first */
ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
pending->root_key.objectid,
- BTRFS_ROOT_BACKREF_KEY,
parent_root->root_key.objectid,
parent_inode->i_ino, index, pending->name,
namelen);
BUG_ON(ret);
- /* now add the forward ref */
- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
- parent_root->root_key.objectid,
- BTRFS_ROOT_REF_KEY,
- pending->root_key.objectid,
- parent_inode->i_ino, index, pending->name,
- namelen);
-
inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
d_instantiate(pending->dentry, inode);
fail:
@@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
unsigned long timeout = 1;
struct btrfs_transaction *cur_trans;
struct btrfs_transaction *prev_trans = NULL;
- struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
int should_grow = 0;
@@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
- if (!pinned_copy)
- return -ENOMEM;
-
- extent_io_tree_init(pinned_copy,
- root->fs_info->btree_inode->i_mapping, GFP_NOFS);
-
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = commit_cowonly_roots(trans, root);
BUG_ON(ret);
+ btrfs_prepare_extent_commit(trans, root);
+
cur_trans = root->fs_info->running_transaction;
spin_lock(&root->fs_info->new_trans_lock);
root->fs_info->running_transaction = NULL;
@@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
sizeof(root->fs_info->super_copy));
- btrfs_copy_pinned(root, pinned_copy);
-
trans->transaction->blocked = 0;
wake_up(&root->fs_info->transaction_wait);
@@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->fs_info->tree_log_mutex);
- btrfs_finish_extent_commit(trans, root, pinned_copy);
- kfree(pinned_copy);
+ btrfs_finish_extent_commit(trans, root);
/* do the directory inserts of any pending snapshot creations */
finish_pending_snapshots(trans, root->fs_info);
@@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return ret;
}
@@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
while (!list_empty(&list)) {
root = list_entry(list.next, struct btrfs_root, root_list);
- list_del_init(&root->root_list);
- btrfs_drop_snapshot(root, 0);
+ list_del(&root->root_list);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ btrfs_drop_snapshot(root, 0);
+ else
+ btrfs_drop_snapshot(root, 1);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c67404918..d4e3e7a6938c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
BTRFS_I(inode)->last_trans = trans->transaction->transid;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 30c0d45c1b5e..741666a7676a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (!root->log_start_pid) {
+ root->log_start_pid = current->pid;
+ root->log_multiple_pids = false;
+ } else if (root->log_start_pid != current->pid) {
+ root->log_multiple_pids = true;
+ }
+
root->log_batch++;
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
return 0;
}
+ root->log_multiple_pids = false;
+ root->log_start_pid = current->pid;
mutex_lock(&root->fs_info->tree_log_mutex);
if (!root->fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_update_pinned_extents(log->fs_info->extent_root,
- eb->start, eb->len, 1);
+ btrfs_pin_extent(log->fs_info->extent_root,
+ eb->start, eb->len, 0);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
ret = btrfs_drop_extents(trans, root, inode,
- start, extent_end, extent_end, start, &alloc_hint);
+ start, extent_end, extent_end, start, &alloc_hint, 1);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ u64 log_transid = 0;
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
unsigned long batch = root->log_batch;
- mutex_unlock(&root->log_mutex);
- schedule_timeout_uninterruptible(1);
- mutex_lock(&root->log_mutex);
-
+ if (root->log_multiple_pids) {
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+ }
wait_for_writer(trans, root);
if (batch == root->log_batch)
break;
@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+ /* we start IO on all the marked extents here, but we don't actually
+ * wait for them until later.
+ */
+ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
BUG_ON(ret);
btrfs_set_root_node(&log->root_item, log->node);
root->log_batch = 0;
+ log_transid = root->log_transid;
root->log_transid++;
log->log_transid = root->log_transid;
+ root->log_start_pid = 0;
smp_mb();
/*
* log tree has been flushed to disk, new modifications of
@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages);
BUG_ON(ret);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
btrfs_set_super_log_root(&root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
- write_ctree_super(trans, root->fs_info->tree_root, 2);
+ write_ctree_super(trans, root->fs_info->tree_root, 1);
ret = 0;
+ mutex_lock(&root->log_mutex);
+ if (root->last_log_commit < log_transid)
+ root->last_log_commit = log_transid;
+ mutex_unlock(&root->log_mutex);
+
out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
@@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
- if (parent == sb->s_root)
+ if (IS_ROOT(parent))
break;
parent = parent->d_parent;
@@ -2852,6 +2876,21 @@ out:
return ret;
}
+static int inode_in_log(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == trans->transid &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ if (root != BTRFS_I(inode)->root ||
+ btrfs_root_refs(&root->root_item) == 0) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
ret = check_parent_dirs_for_sync(trans, inode, parent,
sb, last_committed);
if (ret)
goto end_no_trans;
+ if (inode_in_log(trans, inode)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
start_log_trans(trans, root);
ret = btrfs_log_inode(trans, root, inode, inode_only);
@@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
break;
inode = parent->d_inode;
+ if (root != BTRFS_I(inode)->root)
+ break;
+
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
BUG_ON(ret);
}
- if (parent == sb->s_root)
+ if (IS_ROOT(parent))
break;
parent = parent->d_parent;
@@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_key tmp_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
- u64 highest_inode;
struct walk_control wc = {
.process_func = process_one_buffer,
.stage = 0,
@@ -3010,11 +3062,6 @@ again:
path);
BUG_ON(ret);
}
- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
- if (ret == 0) {
- wc.replay_dest->highest_inode = highest_inode;
- wc.replay_dest->last_inode_alloc = highest_inode;
- }
key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c7609e16b..0776eacb5083 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cf405b0828d..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -276,7 +276,7 @@ loop_lock:
* is now congested. Back off and let other work structs
* run instead
*/
- if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
+ if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
goto error;
device->name = kstrdup(orig_dev->name, GFP_NOFS);
- if (!device->name)
+ if (!device->name) {
+ kfree(device);
goto error;
+ }
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
@@ -719,10 +721,9 @@ error:
* called very infrequently and that a given device has a small number
* of extents
*/
-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 num_bytes, u64 *start,
- u64 *max_avail)
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
BUG_ON(ret);
@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
* step two, delete the device extents and the
* chunk tree entries
*/
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(em->start > chunk_offset ||
em->start + em->len < chunk_offset);
@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
kfree(map);
em->bdev = NULL;
@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
struct btrfs_key found_key;
u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
+ bool retried = false;
+ int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
found_key.objectid,
found_key.offset);
- BUG_ON(ret);
+ if (ret == -ENOSPC)
+ failed++;
+ else if (ret)
+ BUG();
}
if (found_key.offset == 0)
@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
key.offset = found_key.offset - 1;
}
ret = 0;
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ WARN_ON(1);
+ ret = -ENOSPC;
+ }
error:
btrfs_free_path(path);
return ret;
@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
+ if (ret == -ENOSPC)
+ break;
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 1);
@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_chunk);
- key.offset = found_key.offset;
/* chunk zero is special */
- if (key.offset == 0)
+ if (found_key.offset == 0)
break;
btrfs_release_path(chunk_root, path);
@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
- BUG_ON(ret);
+ BUG_ON(ret && ret != -ENOSPC);
+ key.offset = found_key.offset - 1;
}
ret = 0;
error:
@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 chunk_offset;
int ret;
int slot;
+ int failed = 0;
+ bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
if (new_size >= device->total_bytes)
@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto done;
- }
-
path->reada = 2;
lock_chunks(root);
@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (device->writeable)
device->fs_devices->total_rw_bytes -= diff;
unlock_chunks(root);
- btrfs_end_transaction(trans, root);
+again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
goto done;
if (ret) {
ret = 0;
+ btrfs_release_path(root, path);
break;
}
@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
- if (key.objectid != device->devid)
+ if (key.objectid != device->devid) {
+ btrfs_release_path(root, path);
break;
+ }
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (key.offset + length <= new_size)
+ if (key.offset + length <= new_size) {
+ btrfs_release_path(root, path);
break;
+ }
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
- if (ret)
+ if (ret && ret != -ENOSPC)
goto done;
+ if (ret == -ENOSPC)
+ failed++;
+ key.offset -= 1;
+ }
+
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ ret = -ENOSPC;
+ lock_chunks(root);
+
+ device->total_bytes = old_size;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ unlock_chunks(root);
+ goto done;
}
/* Shrinking succeeded, else we would be at "done". */
@@ -2294,9 +2335,9 @@ again:
em->block_len = em->len;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
BUG_ON(ret);
free_extent_map(em);
@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
int readonly = 0;
int i;
- spin_lock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- spin_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->map_tree.lock);
if (!em)
return 1;
@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
struct extent_map *em;
while (1) {
- spin_lock(&tree->map_tree.lock);
+ write_lock(&tree->map_tree.lock);
em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
if (em)
remove_extent_mapping(&tree->map_tree, em);
- spin_unlock(&tree->map_tree.lock);
+ write_unlock(&tree->map_tree.lock);
if (!em)
break;
kfree(em->bdev);
@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2645,9 @@ again:
atomic_set(&multi->error, 0);
}
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em && unplug_page)
return 0;
@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 stripe_nr;
int i, j, nr = 0;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_start, 1);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(!em || em->start != chunk_start);
map = (struct map_lookup *)em->bdev;
@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
- spin_lock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
- spin_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->map_tree.lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev->in_fs_metadata = 1;
}
- spin_lock(&map_tree->map_tree.lock);
+ write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em);
- spin_unlock(&map_tree->map_tree.lock);
+ write_unlock(&map_tree->map_tree.lock);
BUG_ON(ret);
free_extent_map(em);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139a833f721..31b0fabdd2ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
void btrfs_unlock_volumes(void);
void btrfs_lock_volumes(void);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b6dd5967c48a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
* attributes are handled directly.
*/
struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
&btrfs_xattr_acl_access_handler,
&btrfs_xattr_acl_default_handler,
#endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 209f7f15f5f8..6fa530256bfd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -280,7 +280,7 @@ void invalidate_bdev(struct block_device *bdev)
EXPORT_SYMBOL(invalidate_bdev);
/*
- * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
*/
static void free_more_memory(void)
{
@@ -1709,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
/*
* If it's a fully non-blocking write attempt and we cannot
* lock the buffer then redirty the page. Note that this can
- * potentially cause a busy-wait loop from pdflush and kswapd
- * activity, but those code paths have their own higher-level
- * throttling.
+ * potentially cause a busy-wait loop from writeback threads
+ * and kswapd activity, but those code paths have their own
+ * higher-level throttling.
*/
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
@@ -2239,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
- unsigned long limit;
int err;
- err = -EFBIG;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && size > (loff_t)limit) {
- send_sig(SIGXFSZ, current, 0);
- goto out;
- }
- if (size > inode->i_sb->s_maxbytes)
+ err = inode_newsize_ok(inode, size);
+ if (err)
goto out;
err = pagecache_write_begin(NULL, mapping, size, 0,
@@ -3214,7 +3208,7 @@ EXPORT_SYMBOL(block_sync_page);
* still running obsolete flush daemons, so we terminate them here.
*
* Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `pdflush' kernel threads fully replace bdflush daemons and this call.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
*/
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cbc57f932d2..d6db933df2b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -264,7 +264,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
{
struct char_device_struct *cd;
struct cdev *cdev;
- char *s;
int err = -ENOMEM;
cd = __register_chrdev_region(major, baseminor, count, name);
@@ -278,8 +277,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
cdev->owner = fops->owner;
cdev->ops = fops;
kobject_set_name(&cdev->kobj, "%s", name);
- for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
- *s = '!';
err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
if (err)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 6994a0f54f02..80f352596807 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,7 @@ config CIFS
tristate "CIFS support (advanced network filesystem, SMBFS successor)"
depends on INET
select NLS
+ select SLOW_WORK
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d79ce2e95c23..9a5e4f5f3122 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,9 +64,6 @@ unsigned int multiuser_mount = 0;
unsigned int extended_security = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
-extern struct task_struct *oplockThread; /* remove sparse warning */
-struct task_struct *oplockThread = NULL;
-/* extern struct task_struct * dnotifyThread; remove sparse warning */
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, int, 0);
@@ -185,8 +182,7 @@ out_mount_failed:
cifs_sb->mountdata = NULL;
}
#endif
- if (cifs_sb->local_nls)
- unload_nls(cifs_sb->local_nls);
+ unload_nls(cifs_sb->local_nls);
kfree(cifs_sb);
}
return rc;
@@ -973,89 +969,12 @@ cifs_destroy_mids(void)
kmem_cache_destroy(cifs_oplock_cachep);
}
-static int cifs_oplock_thread(void *dummyarg)
-{
- struct oplock_q_entry *oplock_item;
- struct cifsTconInfo *pTcon;
- struct inode *inode;
- __u16 netfid;
- int rc, waitrc = 0;
-
- set_freezable();
- do {
- if (try_to_freeze())
- continue;
-
- spin_lock(&cifs_oplock_lock);
- if (list_empty(&cifs_oplock_list)) {
- spin_unlock(&cifs_oplock_lock);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(39*HZ);
- } else {
- oplock_item = list_entry(cifs_oplock_list.next,
- struct oplock_q_entry, qhead);
- cFYI(1, ("found oplock item to write out"));
- pTcon = oplock_item->tcon;
- inode = oplock_item->pinode;
- netfid = oplock_item->netfid;
- spin_unlock(&cifs_oplock_lock);
- DeleteOplockQEntry(oplock_item);
- /* can not grab inode sem here since it would
- deadlock when oplock received on delete
- since vfs_unlink holds the i_mutex across
- the call */
- /* mutex_lock(&inode->i_mutex);*/
- if (S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
- if (CIFS_I(inode)->clientCanCacheAll == 0)
- break_lease(inode, FMODE_READ);
- else if (CIFS_I(inode)->clientCanCacheRead == 0)
- break_lease(inode, FMODE_WRITE);
-#endif
- rc = filemap_fdatawrite(inode->i_mapping);
- if (CIFS_I(inode)->clientCanCacheRead == 0) {
- waitrc = filemap_fdatawait(
- inode->i_mapping);
- invalidate_remote_inode(inode);
- }
- if (rc == 0)
- rc = waitrc;
- } else
- rc = 0;
- /* mutex_unlock(&inode->i_mutex);*/
- if (rc)
- CIFS_I(inode)->write_behind_rc = rc;
- cFYI(1, ("Oplock flush inode %p rc %d",
- inode, rc));
-
- /* releasing stale oplock after recent reconnect
- of smb session using a now incorrect file
- handle is not a data integrity issue but do
- not bother sending an oplock release if session
- to server still is disconnected since oplock
- already released by the server in that case */
- if (!pTcon->need_reconnect) {
- rc = CIFSSMBLock(0, pTcon, netfid,
- 0 /* len */ , 0 /* offset */, 0,
- 0, LOCKING_ANDX_OPLOCK_RELEASE,
- false /* wait flag */);
- cFYI(1, ("Oplock release rc = %d", rc));
- }
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1); /* yield in case q were corrupt */
- }
- } while (!kthread_should_stop());
-
- return 0;
-}
-
static int __init
init_cifs(void)
{
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
- INIT_LIST_HEAD(&cifs_oplock_list);
#ifdef CONFIG_CIFS_EXPERIMENTAL
INIT_LIST_HEAD(&GlobalDnotifyReqList);
INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1084,7 +1003,6 @@ init_cifs(void)
rwlock_init(&GlobalSMBSeslock);
rwlock_init(&cifs_tcp_ses_lock);
spin_lock_init(&GlobalMid_Lock);
- spin_lock_init(&cifs_oplock_lock);
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
@@ -1119,16 +1037,13 @@ init_cifs(void)
if (rc)
goto out_unregister_key_type;
#endif
- oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
- if (IS_ERR(oplockThread)) {
- rc = PTR_ERR(oplockThread);
- cERROR(1, ("error %d create oplock thread", rc));
- goto out_unregister_dfs_key_type;
- }
+ rc = slow_work_register_user();
+ if (rc)
+ goto out_unregister_resolver_key;
return 0;
- out_unregister_dfs_key_type:
+ out_unregister_resolver_key:
#ifdef CONFIG_CIFS_DFS_UPCALL
unregister_key_type(&key_type_dns_resolver);
out_unregister_key_type:
@@ -1165,7 +1080,6 @@ exit_cifs(void)
cifs_destroy_inodecache();
cifs_destroy_mids();
cifs_destroy_request_bufs();
- kthread_stop(oplockThread);
}
MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6cfc81a32703..5d0fde18039c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
*/
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/slow-work.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
/*
@@ -346,14 +347,16 @@ struct cifsFileInfo {
/* lock scope id (0 if none) */
struct file *pfile; /* needed for writepage */
struct inode *pInode; /* needed for oplock break */
+ struct vfsmount *mnt;
struct mutex lock_mutex;
struct list_head llist; /* list of byte range locks we have. */
bool closePend:1; /* file is marked to close */
bool invalidHandle:1; /* file closed via session abend */
- bool messageMode:1; /* for pipes: message vs byte mode */
+ bool oplock_break_cancelled:1;
atomic_t count; /* reference count */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
+ struct slow_work oplock_break; /* slow_work job for oplock breaks */
};
/* Take a reference on the file private data */
@@ -365,8 +368,10 @@ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
/* Release a reference on the file private data */
static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
- if (atomic_dec_and_test(&cifs_file->count))
+ if (atomic_dec_and_test(&cifs_file->count)) {
+ iput(cifs_file->pInode);
kfree(cifs_file);
+ }
}
/*
@@ -382,7 +387,6 @@ struct cifsInodeInfo {
unsigned long time; /* jiffies of last update/check of inode */
bool clientCanCacheRead:1; /* read oplock */
bool clientCanCacheAll:1; /* read and writebehind oplock */
- bool oplockPending:1;
bool delete_pending:1; /* DELETE_ON_CLOSE is set */
u64 server_eof; /* current file size on server */
u64 uniqueid; /* server inode number */
@@ -585,9 +589,9 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_LANMAN 0x10010
#define CIFSSEC_MUST_PLNTXT 0x20020
#ifdef CONFIG_CIFS_UPCALL
-#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */
+#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */
#else
-#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */
+#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
#ifdef CONFIG_CIFS_UPCALL
@@ -669,12 +673,6 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
*/
GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
-/* Global list of oplocks */
-GLOBAL_EXTERN struct list_head cifs_oplock_list;
-
-/* Protects the cifs_oplock_list */
-GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
-
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
/* DirNotify response queue */
@@ -725,3 +723,4 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern const struct slow_work_ops cifs_oplock_break_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index da8fbf565991..6928c24d1d42 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,18 +86,17 @@ extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
const int stage,
const struct nls_table *nls_cp);
extern __u16 GetNextMid(struct TCP_Server_Info *server);
-extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
- struct cifsTconInfo *);
-extern void DeleteOplockQEntry(struct oplock_q_entry *);
-extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
+extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
+ __u16 fileHandle, struct file *file,
+ struct vfsmount *mnt, unsigned int oflags);
extern int cifs_posix_open(char *full_path, struct inode **pinode,
- struct super_block *sb, int mode, int oflags,
- int *poplock, __u16 *pnetfid, int xid);
+ struct vfsmount *mnt, int mode, int oflags,
+ __u32 *poplock, __u16 *pnetfid, int xid);
extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
FILE_UNIX_BASIC_INFO *info,
struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 301e307e1279..941441d3e386 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -94,6 +94,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
open_file = list_entry(tmp, struct cifsFileInfo, tlist);
open_file->invalidHandle = true;
+ open_file->oplock_break_cancelled = true;
}
write_unlock(&GlobalSMBSeslock);
/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d49682433c20..43003e0bef18 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1670,7 +1670,6 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
CIFSSMBTDis(xid, tcon);
_FreeXid(xid);
- DeleteTconOplockQEntries(tcon);
tconInfoFree(tcon);
cifs_put_smb_ses(ses);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a6424cfc0121..627a60a6c1b1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -24,6 +24,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/namei.h>
+#include <linux/mount.h>
#include "cifsfs.h"
#include "cifspdu.h"
#include "cifsglob.h"
@@ -129,44 +130,45 @@ cifs_bp_rename_retry:
return full_path;
}
-static void
-cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
- struct cifsTconInfo *tcon, bool write_only)
+struct cifsFileInfo *
+cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
+ struct file *file, struct vfsmount *mnt, unsigned int oflags)
{
int oplock = 0;
struct cifsFileInfo *pCifsFile;
struct cifsInodeInfo *pCifsInode;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
if (pCifsFile == NULL)
- return;
+ return pCifsFile;
if (oplockEnabled)
oplock = REQ_OPLOCK;
pCifsFile->netfid = fileHandle;
pCifsFile->pid = current->tgid;
- pCifsFile->pInode = newinode;
+ pCifsFile->pInode = igrab(newinode);
+ pCifsFile->mnt = mnt;
+ pCifsFile->pfile = file;
pCifsFile->invalidHandle = false;
pCifsFile->closePend = false;
mutex_init(&pCifsFile->fh_mutex);
mutex_init(&pCifsFile->lock_mutex);
INIT_LIST_HEAD(&pCifsFile->llist);
atomic_set(&pCifsFile->count, 1);
+ slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
- /* set the following in open now
- pCifsFile->pfile = file; */
write_lock(&GlobalSMBSeslock);
- list_add(&pCifsFile->tlist, &tcon->openFileList);
+ list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
pCifsInode = CIFS_I(newinode);
if (pCifsInode) {
/* if readable file instance put first in list*/
- if (write_only)
+ if (oflags & FMODE_READ)
+ list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+ else
list_add_tail(&pCifsFile->flist,
&pCifsInode->openFileList);
- else
- list_add(&pCifsFile->flist, &pCifsInode->openFileList);
if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
pCifsInode->clientCanCacheAll = true;
@@ -176,18 +178,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
pCifsInode->clientCanCacheRead = true;
}
write_unlock(&GlobalSMBSeslock);
+
+ return pCifsFile;
}
int cifs_posix_open(char *full_path, struct inode **pinode,
- struct super_block *sb, int mode, int oflags,
- int *poplock, __u16 *pnetfid, int xid)
+ struct vfsmount *mnt, int mode, int oflags,
+ __u32 *poplock, __u16 *pnetfid, int xid)
{
int rc;
- __u32 oplock;
- bool write_only = false;
FILE_UNIX_BASIC_INFO *presp_data;
__u32 posix_flags = 0;
- struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
struct cifs_fattr fattr;
cFYI(1, ("posix open %s", full_path));
@@ -223,12 +225,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
if (oflags & O_DIRECT)
posix_flags |= SMB_O_DIRECT;
- if (!(oflags & FMODE_READ))
- write_only = true;
-
mode &= ~current_umask();
rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
- pnetfid, presp_data, &oplock, full_path,
+ pnetfid, presp_data, poplock, full_path,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc)
@@ -244,7 +243,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
/* get new inode and set it up */
if (*pinode == NULL) {
- *pinode = cifs_iget(sb, &fattr);
+ *pinode = cifs_iget(mnt->mnt_sb, &fattr);
if (!*pinode) {
rc = -ENOMEM;
goto posix_open_ret;
@@ -253,7 +252,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
cifs_fattr_to_inode(*pinode, &fattr);
}
- cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
+ cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
posix_open_ret:
kfree(presp_data);
@@ -280,7 +279,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
int rc = -ENOENT;
int xid;
int create_options = CREATE_NOT_DIR;
- int oplock = 0;
+ __u32 oplock = 0;
int oflags;
bool posix_create = false;
/*
@@ -298,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
FILE_ALL_INFO *buf = NULL;
struct inode *newinode = NULL;
int disposition = FILE_OVERWRITE_IF;
- bool write_only = false;
xid = GetXid();
@@ -323,7 +321,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
- rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+ rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
mode, oflags, &oplock, &fileHandle, xid);
/* EIO could indicate that (posix open) operation is not
supported, despite what server claimed in capability
@@ -351,11 +349,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
desiredAccess = 0;
if (oflags & FMODE_READ)
desiredAccess |= GENERIC_READ; /* is this too little? */
- if (oflags & FMODE_WRITE) {
+ if (oflags & FMODE_WRITE)
desiredAccess |= GENERIC_WRITE;
- if (!(oflags & FMODE_READ))
- write_only = true;
- }
if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
disposition = FILE_CREATE;
@@ -470,8 +465,8 @@ cifs_create_set_dentry:
/* mknod case - do not leave file open */
CIFSSMBClose(xid, tcon, fileHandle);
} else if (!(posix_create) && (newinode)) {
- cifs_fill_fileinfo(newinode, fileHandle,
- cifs_sb->tcon, write_only);
+ cifs_new_fileinfo(newinode, fileHandle, NULL,
+ nd->path.mnt, oflags);
}
cifs_create_out:
kfree(buf);
@@ -611,7 +606,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
{
int xid;
int rc = 0; /* to get around spurious gcc warning, set to zero here */
- int oplock = 0;
+ __u32 oplock = 0;
__u16 fileHandle = 0;
bool posix_open = false;
struct cifs_sb_info *cifs_sb;
@@ -683,8 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
(nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
(nd->intent.open.flags & O_CREAT)) {
- rc = cifs_posix_open(full_path, &newInode,
- parent_dir_inode->i_sb,
+ rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
nd->intent.open.create_mode,
nd->intent.open.flags, &oplock,
&fileHandle, xid);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa7beac8b80e..429337eb7afe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -30,6 +30,7 @@
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/delay.h>
+#include <linux/mount.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -39,27 +40,6 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
-static inline struct cifsFileInfo *cifs_init_private(
- struct cifsFileInfo *private_data, struct inode *inode,
- struct file *file, __u16 netfid)
-{
- memset(private_data, 0, sizeof(struct cifsFileInfo));
- private_data->netfid = netfid;
- private_data->pid = current->tgid;
- mutex_init(&private_data->fh_mutex);
- mutex_init(&private_data->lock_mutex);
- INIT_LIST_HEAD(&private_data->llist);
- private_data->pfile = file; /* needed for writepage */
- private_data->pInode = inode;
- private_data->invalidHandle = false;
- private_data->closePend = false;
- /* Initialize reference count to one. The private data is
- freed on the release of the last reference */
- atomic_set(&private_data->count, 1);
-
- return private_data;
-}
-
static inline int cifs_convert_flags(unsigned int flags)
{
if ((flags & O_ACCMODE) == O_RDONLY)
@@ -123,9 +103,11 @@ static inline int cifs_get_disposition(unsigned int flags)
}
/* all arguments to this function must be checked for validity in caller */
-static inline int cifs_posix_open_inode_helper(struct inode *inode,
- struct file *file, struct cifsInodeInfo *pCifsInode,
- struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+static inline int
+cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
+ struct cifsInodeInfo *pCifsInode,
+ struct cifsFileInfo *pCifsFile, __u32 oplock,
+ u16 netfid)
{
write_lock(&GlobalSMBSeslock);
@@ -219,17 +201,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
struct timespec temp;
int rc;
- /* want handles we can use to read with first
- in the list so we do not have to walk the
- list to search for one in write_begin */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
- list_add_tail(&pCifsFile->flist,
- &pCifsInode->openFileList);
- } else {
- list_add(&pCifsFile->flist,
- &pCifsInode->openFileList);
- }
- write_unlock(&GlobalSMBSeslock);
if (pCifsInode->clientCanCacheRead) {
/* we have the inode open somewhere else
no need to discard cache data */
@@ -279,7 +250,8 @@ client_can_cache:
int cifs_open(struct inode *inode, struct file *file)
{
int rc = -EACCES;
- int xid, oplock;
+ int xid;
+ __u32 oplock;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *tcon;
struct cifsFileInfo *pCifsFile;
@@ -324,7 +296,7 @@ int cifs_open(struct inode *inode, struct file *file)
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
/* can not refresh inode info since size could be stale */
- rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+ rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &netfid, xid);
if (rc == 0) {
@@ -414,24 +386,17 @@ int cifs_open(struct inode *inode, struct file *file)
cFYI(1, ("cifs_open returned 0x%x", rc));
goto out;
}
- file->private_data =
- kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+ pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
+ file->f_flags);
+ file->private_data = pCifsFile;
if (file->private_data == NULL) {
rc = -ENOMEM;
goto out;
}
- pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
- write_lock(&GlobalSMBSeslock);
- list_add(&pCifsFile->tlist, &tcon->openFileList);
- pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
- if (pCifsInode) {
- rc = cifs_open_inode_helper(inode, file, pCifsInode,
- pCifsFile, tcon,
- &oplock, buf, full_path, xid);
- } else {
- write_unlock(&GlobalSMBSeslock);
- }
+ rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
+ &oplock, buf, full_path, xid);
if (oplock & CIFS_CREATE_ACTION) {
/* time to set mode which we can not set earlier due to
@@ -474,7 +439,8 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
static int cifs_reopen_file(struct file *file, bool can_flush)
{
int rc = -EACCES;
- int xid, oplock;
+ int xid;
+ __u32 oplock;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *tcon;
struct cifsFileInfo *pCifsFile;
@@ -543,7 +509,7 @@ reopen_error_exit:
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
/* can not refresh inode info since size could be stale */
- rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+ rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &netfid, xid);
if (rc == 0) {
@@ -2308,6 +2274,73 @@ out:
return rc;
}
+static void
+cifs_oplock_break(struct slow_work *work)
+{
+ struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+ oplock_break);
+ struct inode *inode = cfile->pInode;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
+ int rc, waitrc = 0;
+
+ if (inode && S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+ if (cinode->clientCanCacheAll == 0)
+ break_lease(inode, FMODE_READ);
+ else if (cinode->clientCanCacheRead == 0)
+ break_lease(inode, FMODE_WRITE);
+#endif
+ rc = filemap_fdatawrite(inode->i_mapping);
+ if (cinode->clientCanCacheRead == 0) {
+ waitrc = filemap_fdatawait(inode->i_mapping);
+ invalidate_remote_inode(inode);
+ }
+ if (!rc)
+ rc = waitrc;
+ if (rc)
+ cinode->write_behind_rc = rc;
+ cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+ }
+
+ /*
+ * releasing stale oplock after recent reconnect of smb session using
+ * a now incorrect file handle is not a data integrity issue but do
+ * not bother sending an oplock release if session to server still is
+ * disconnected since oplock already released by the server
+ */
+ if (!cfile->closePend && !cfile->oplock_break_cancelled) {
+ rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
+ LOCKING_ANDX_OPLOCK_RELEASE, false);
+ cFYI(1, ("Oplock release rc = %d", rc));
+ }
+}
+
+static int
+cifs_oplock_break_get(struct slow_work *work)
+{
+ struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+ oplock_break);
+ mntget(cfile->mnt);
+ cifsFileInfo_get(cfile);
+ return 0;
+}
+
+static void
+cifs_oplock_break_put(struct slow_work *work)
+{
+ struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+ oplock_break);
+ mntput(cfile->mnt);
+ cifsFileInfo_put(cfile);
+}
+
+const struct slow_work_ops cifs_oplock_break_ops = {
+ .get_ref = cifs_oplock_break_get,
+ .put_ref = cifs_oplock_break_put,
+ .execute = cifs_oplock_break,
+};
+
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
.readpages = cifs_readpages,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 1f09c7619319..5e2492535daa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
static int cifs_vmtruncate(struct inode *inode, loff_t offset)
{
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
+ loff_t oldsize;
+ int err;
spin_lock(&inode->i_lock);
- if (inode->i_size < offset)
- goto do_expand;
- /*
- * truncation of in-use swapfiles is disallowed - it would cause
- * subsequent swapout to scribble on the now-freed blocks.
- */
- if (IS_SWAPFILE(inode)) {
- spin_unlock(&inode->i_lock);
- goto out_busy;
- }
- i_size_write(inode, offset);
- spin_unlock(&inode->i_lock);
- /*
- * unmap_mapping_range is called twice, first simply for efficiency
- * so that truncate_inode_pages does fewer single-page unmaps. However
- * after this first call, and before truncate_inode_pages finishes,
- * it is possible for private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second unmap_mapping_range
- * call must be made for correctness.
- */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- goto out_truncate;
-
-do_expand:
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit) {
+ err = inode_newsize_ok(inode, offset);
+ if (err) {
spin_unlock(&inode->i_lock);
- goto out_sig;
- }
- if (offset > inode->i_sb->s_maxbytes) {
- spin_unlock(&inode->i_lock);
- goto out_big;
+ goto out;
}
+
+ oldsize = inode->i_size;
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
-out_truncate:
+ truncate_pagecache(inode, oldsize, offset);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out_big:
- return -EFBIG;
-out_busy:
- return -ETXTBSY;
+out:
+ return err;
}
static int
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e079a9190ec4..0241b25ac33f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -32,7 +32,6 @@
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
-extern struct task_struct *oplockThread;
/* The xid serves as a useful identifier for each incoming vfs request,
in a similar way to the mid which is useful to track each sent smb,
@@ -500,6 +499,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
struct cifsTconInfo *tcon;
struct cifsInodeInfo *pCifsInode;
struct cifsFileInfo *netfile;
+ int rc;
cFYI(1, ("Checking for oplock break or dnotify response"));
if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -562,30 +562,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
continue;
cifs_stats_inc(&tcon->num_oplock_brks);
- write_lock(&GlobalSMBSeslock);
+ read_lock(&GlobalSMBSeslock);
list_for_each(tmp2, &tcon->openFileList) {
netfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
if (pSMB->Fid != netfile->netfid)
continue;
- write_unlock(&GlobalSMBSeslock);
- read_unlock(&cifs_tcp_ses_lock);
+ /*
+ * don't do anything if file is about to be
+ * closed anyway.
+ */
+ if (netfile->closePend) {
+ read_unlock(&GlobalSMBSeslock);
+ read_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+
cFYI(1, ("file id match, oplock break"));
pCifsInode = CIFS_I(netfile->pInode);
pCifsInode->clientCanCacheAll = false;
if (pSMB->OplockLevel == 0)
pCifsInode->clientCanCacheRead = false;
- pCifsInode->oplockPending = true;
- AllocOplockQEntry(netfile->pInode,
- netfile->netfid, tcon);
- cFYI(1, ("about to wake up oplock thread"));
- if (oplockThread)
- wake_up_process(oplockThread);
-
+ rc = slow_work_enqueue(&netfile->oplock_break);
+ if (rc) {
+ cERROR(1, ("failed to enqueue oplock "
+ "break: %d\n", rc));
+ } else {
+ netfile->oplock_break_cancelled = false;
+ }
+ read_unlock(&GlobalSMBSeslock);
+ read_unlock(&cifs_tcp_ses_lock);
return true;
}
- write_unlock(&GlobalSMBSeslock);
+ read_unlock(&GlobalSMBSeslock);
read_unlock(&cifs_tcp_ses_lock);
cFYI(1, ("No matching file for oplock break"));
return true;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f823a4a208a7..1f098ca71636 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -146,7 +146,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
}
-void
+static void
cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
struct cifs_sb_info *cifs_sb)
{
@@ -161,7 +161,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
cifs_fill_common_info(fattr, cifs_sb);
}
-void
+static void
cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
struct cifs_sb_info *cifs_sb)
{
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1da4ab250eae..07b8e71544ee 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -103,56 +103,6 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
mempool_free(midEntry, cifs_mid_poolp);
}
-struct oplock_q_entry *
-AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
-{
- struct oplock_q_entry *temp;
- if ((pinode == NULL) || (tcon == NULL)) {
- cERROR(1, ("Null parms passed to AllocOplockQEntry"));
- return NULL;
- }
- temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep,
- GFP_KERNEL);
- if (temp == NULL)
- return temp;
- else {
- temp->pinode = pinode;
- temp->tcon = tcon;
- temp->netfid = fid;
- spin_lock(&cifs_oplock_lock);
- list_add_tail(&temp->qhead, &cifs_oplock_list);
- spin_unlock(&cifs_oplock_lock);
- }
- return temp;
-}
-
-void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
-{
- spin_lock(&cifs_oplock_lock);
- /* should we check if list empty first? */
- list_del(&oplockEntry->qhead);
- spin_unlock(&cifs_oplock_lock);
- kmem_cache_free(cifs_oplock_cachep, oplockEntry);
-}
-
-
-void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
-{
- struct oplock_q_entry *temp;
-
- if (tcon == NULL)
- return;
-
- spin_lock(&cifs_oplock_lock);
- list_for_each_entry(temp, &cifs_oplock_list, qhead) {
- if ((temp->tcon) && (temp->tcon == tcon)) {
- list_del(&temp->qhead);
- kmem_cache_free(cifs_oplock_cachep, temp);
- }
- }
- spin_unlock(&cifs_oplock_lock);
-}
-
static int
smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
{
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
#define _CODA_INT_
struct dentry;
+struct file;
extern struct file_system_type coda_fs_type;
extern unsigned long coda_timeout;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44a..be4392ca2098 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/time.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/ioport.h>
#include <linux/fcntl.h>
diff --git a/fs/compat.c b/fs/compat.c
index 3aa48834a222..d576b552e8e2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -768,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
char __user * type, unsigned long flags,
void __user * data)
{
- unsigned long type_page;
+ char *kernel_type;
unsigned long data_page;
- unsigned long dev_page;
+ char *kernel_dev;
char *dir_page;
int retval;
- retval = copy_mount_options (type, &type_page);
+ retval = copy_mount_string(type, &kernel_type);
if (retval < 0)
goto out;
@@ -783,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
if (IS_ERR(dir_page))
goto out1;
- retval = copy_mount_options (dev_name, &dev_page);
+ retval = copy_mount_string(dev_name, &kernel_dev);
if (retval < 0)
goto out2;
- retval = copy_mount_options (data, &data_page);
+ retval = copy_mount_options(data, &data_page);
if (retval < 0)
goto out3;
retval = -EINVAL;
- if (type_page && data_page) {
- if (!strcmp((char *)type_page, SMBFS_NAME)) {
+ if (kernel_type && data_page) {
+ if (!strcmp(kernel_type, SMBFS_NAME)) {
do_smb_super_data_conv((void *)data_page);
- } else if (!strcmp((char *)type_page, NCPFS_NAME)) {
+ } else if (!strcmp(kernel_type, NCPFS_NAME)) {
do_ncp_super_data_conv((void *)data_page);
- } else if (!strcmp((char *)type_page, NFS4_NAME)) {
+ } else if (!strcmp(kernel_type, NFS4_NAME)) {
if (do_nfs4_super_data_conv((void *) data_page))
goto out4;
}
}
- retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
+ retval = do_mount(kernel_dev, dir_page, kernel_type,
flags, (void*)data_page);
out4:
free_page(data_page);
out3:
- free_page(dev_page);
+ kfree(kernel_dev);
out2:
putname(dir_page);
out1:
- free_page(type_page);
+ kfree(kernel_type);
out:
return retval;
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 240cef14fe58..70736eb4b516 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid)
{
struct connection *con;
+ /* with sctp there's no connecting without sending */
+ if (dlm_config.ci_protocol != 0)
+ return 0;
+
if (nodeid == dlm_our_nodeid())
return 0;
@@ -455,9 +459,9 @@ static void process_sctp_notification(struct connection *con,
int prim_len, ret;
int addr_len;
struct connection *new_con;
- struct file *file;
sctp_peeloff_arg_t parg;
int parglen = sizeof(parg);
+ int err;
/*
* We get this before any data for an association.
@@ -512,19 +516,22 @@ static void process_sctp_notification(struct connection *con,
ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
SCTP_SOCKOPT_PEELOFF,
(void *)&parg, &parglen);
- if (ret) {
+ if (ret < 0) {
log_print("Can't peel off a socket for "
- "connection %d to node %d: err=%d\n",
+ "connection %d to node %d: err=%d",
parg.associd, nodeid, ret);
+ return;
+ }
+ new_con->sock = sockfd_lookup(parg.sd, &err);
+ if (!new_con->sock) {
+ log_print("sockfd_lookup error %d", err);
+ return;
}
- file = fget(parg.sd);
- new_con->sock = SOCKET_I(file->f_dentry->d_inode);
add_sock(new_con->sock, new_con);
- fput(file);
- put_unused_fd(parg.sd);
+ sockfd_put(new_con->sock);
- log_print("got new/restarted association %d nodeid %d",
- (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+ log_print("connecting to %d sctp association %d",
+ nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
@@ -837,8 +844,6 @@ static void sctp_init_assoc(struct connection *con)
if (con->retries++ > MAX_CONNECT_RETRIES)
return;
- log_print("Initiating association with node %d", con->nodeid);
-
if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
log_print("no address for nodeid %d", con->nodeid);
return;
@@ -855,11 +860,14 @@ static void sctp_init_assoc(struct connection *con)
outmessage.msg_flags = MSG_EOR;
spin_lock(&con->writequeue_lock);
- e = list_entry(con->writequeue.next, struct writequeue_entry,
- list);
- BUG_ON((struct list_head *) e == &con->writequeue);
+ if (list_empty(&con->writequeue)) {
+ spin_unlock(&con->writequeue_lock);
+ log_print("writequeue empty for nodeid %d", con->nodeid);
+ return;
+ }
+ e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
spin_unlock(&con->writequeue_lock);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
}
int drop_caches_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
if (write) {
if (sysctl_drop_caches & 1)
drop_pagecache();
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 0c754e64232b..1cd6d9d3e29a 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,9 @@
config ECRYPT_FS
tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && KEYS && CRYPTO && NET
+ depends on EXPERIMENTAL && KEYS && CRYPTO
+ select CRYPTO_ECB
+ select CRYPTO_CBC
+ select CRYPTO_MD5
help
Encrypted filesystem that operates on the VFS layer. See
<file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b91851f1cda3..fbb6e5eed697 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -245,13 +245,11 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
crypto_free_blkcipher(crypt_stat->tfm);
if (crypt_stat->hash_tfm)
crypto_free_hash(crypt_stat->hash_tfm);
- mutex_lock(&crypt_stat->keysig_list_mutex);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
kmem_cache_free(ecryptfs_key_sig_cache, key_sig);
}
- mutex_unlock(&crypt_stat->keysig_list_mutex);
memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
}
@@ -511,13 +509,14 @@ int ecryptfs_encrypt_page(struct page *page)
+ extent_offset), crypt_stat);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
offset, crypt_stat->extent_size);
- if (rc) {
+ if (rc < 0) {
ecryptfs_printk(KERN_ERR, "Error attempting "
"to write lower page; rc = [%d]"
"\n", rc);
goto out;
}
}
+ rc = 0;
out:
if (enc_extent_page) {
kunmap(enc_extent_page);
@@ -633,7 +632,7 @@ int ecryptfs_decrypt_page(struct page *page)
rc = ecryptfs_read_lower(enc_extent_virt, offset,
crypt_stat->extent_size,
ecryptfs_inode);
- if (rc) {
+ if (rc < 0) {
ecryptfs_printk(KERN_ERR, "Error attempting "
"to read lower page; rc = [%d]"
"\n", rc);
@@ -797,6 +796,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
kfree(full_alg_name);
if (IS_ERR(crypt_stat->tfm)) {
rc = PTR_ERR(crypt_stat->tfm);
+ crypt_stat->tfm = NULL;
ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
"Error initializing cipher [%s]\n",
crypt_stat->cipher);
@@ -925,7 +925,9 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
struct ecryptfs_global_auth_tok *global_auth_tok;
int rc = 0;
+ mutex_lock(&crypt_stat->keysig_list_mutex);
mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
list_for_each_entry(global_auth_tok,
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
@@ -934,13 +936,13 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
if (rc) {
printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
- mutex_unlock(
- &mount_crypt_stat->global_auth_tok_list_mutex);
goto out;
}
}
- mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
out:
+ mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+ mutex_unlock(&crypt_stat->keysig_list_mutex);
return rc;
}
@@ -1212,14 +1214,15 @@ int ecryptfs_read_and_validate_header_region(char *data,
crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
ecryptfs_inode);
- if (rc) {
+ if (rc < 0) {
printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
__func__, rc);
goto out;
}
if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
rc = -EINVAL;
- }
+ } else
+ rc = 0;
out:
return rc;
}
@@ -1314,10 +1317,11 @@ ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
0, virt_len);
- if (rc)
+ if (rc < 0)
printk(KERN_ERR "%s: Error attempting to write header "
- "information to lower file; rc = [%d]\n", __func__,
- rc);
+ "information to lower file; rc = [%d]\n", __func__, rc);
+ else
+ rc = 0;
return rc;
}
@@ -1597,7 +1601,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
}
rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
ecryptfs_inode);
- if (!rc)
+ if (rc >= 0)
rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
ecryptfs_dentry,
ECRYPTFS_VALIDATE_HEADER_SIZE);
@@ -1702,7 +1706,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
} else {
printk(KERN_ERR "%s: No support for requested filename "
"encryption method in this release\n", __func__);
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
goto out;
}
out:
@@ -1763,7 +1767,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
if (IS_ERR(*key_tfm)) {
rc = PTR_ERR(*key_tfm);
printk(KERN_ERR "Unable to allocate crypto cipher with name "
- "[%s]; rc = [%d]\n", cipher_name, rc);
+ "[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
@@ -1776,7 +1780,8 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
if (rc) {
printk(KERN_ERR "Error attempting to set key of size [%zd] for "
- "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
+ "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
+ rc);
rc = -EINVAL;
goto out;
}
@@ -2166,7 +2171,7 @@ int ecryptfs_encrypt_and_encode_filename(
(*encoded_name)[(*encoded_name_size)] = '\0';
(*encoded_name_size)++;
} else {
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
}
if (rc) {
printk(KERN_ERR "%s: Error attempting to encode "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 2f0945d63297..056fed62d0de 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -476,6 +476,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
struct dentry *lower_dir_dentry;
+ dget(lower_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
rc = vfs_unlink(lower_dir_inode, lower_dentry);
if (rc) {
@@ -489,6 +490,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
d_drop(dentry);
out_unlock:
unlock_dir(lower_dir_dentry);
+ dput(lower_dentry);
return rc;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 259525c9abb8..a0a7847567e9 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -416,7 +416,9 @@ ecryptfs_find_global_auth_tok_for_sig(
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
- (*global_auth_tok) = walker;
+ rc = key_validate(walker->global_auth_tok_key);
+ if (!rc)
+ (*global_auth_tok) = walker;
goto out;
}
}
@@ -612,7 +614,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
}
/* TODO: Support other key modules than passphrase for
* filename encryption */
- BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+ rc = -EOPNOTSUPP;
+ printk(KERN_INFO "%s: Filename encryption only supports "
+ "password tokens\n", __func__);
+ goto out_free_unlock;
+ }
sg_init_one(
&s->hash_sg,
(u8 *)s->auth_tok->token.password.session_key_encryption_key,
@@ -910,7 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
}
/* TODO: Support other key modules than passphrase for
* filename encryption */
- BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+ rc = -EOPNOTSUPP;
+ printk(KERN_INFO "%s: Filename encryption only supports "
+ "password tokens\n", __func__);
+ goto out_free_unlock;
+ }
rc = crypto_blkcipher_setkey(
s->desc.tfm,
s->auth_tok->token.password.session_key_encryption_key,
@@ -1316,8 +1328,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
rc = -EINVAL;
goto out_free;
}
- ecryptfs_cipher_code_to_string(crypt_stat->cipher,
- (u16)data[(*packet_size)]);
+ rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher,
+ (u16)data[(*packet_size)]);
+ if (rc)
+ goto out_free;
/* A little extra work to differentiate among the AES key
* sizes; see RFC2440 */
switch(data[(*packet_size)++]) {
@@ -1328,7 +1342,9 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->key_size =
(*new_auth_tok)->session_key.encrypted_key_size;
}
- ecryptfs_init_crypt_ctx(crypt_stat);
+ rc = ecryptfs_init_crypt_ctx(crypt_stat);
+ if (rc)
+ goto out_free;
if (unlikely(data[(*packet_size)++] != 0x03)) {
printk(KERN_WARNING "Only S2K ID 3 is currently supported\n");
rc = -ENOSYS;
@@ -2366,21 +2382,18 @@ struct kmem_cache *ecryptfs_key_sig_cache;
int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
{
struct ecryptfs_key_sig *new_key_sig;
- int rc = 0;
new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
if (!new_key_sig) {
- rc = -ENOMEM;
printk(KERN_ERR
"Error allocating from ecryptfs_key_sig_cache\n");
- goto out;
+ return -ENOMEM;
}
memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
- mutex_lock(&crypt_stat->keysig_list_mutex);
+ /* Caller must hold keysig_list_mutex */
list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
- mutex_unlock(&crypt_stat->keysig_list_mutex);
-out:
- return rc;
+
+ return 0;
}
struct kmem_cache *ecryptfs_global_auth_tok_cache;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c6d7a4d748a0..e14cf7e588db 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -136,6 +136,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
const struct cred *cred)
{
struct ecryptfs_open_req *req;
+ int flags = O_LARGEFILE;
int rc = 0;
/* Corresponding dput() and mntput() are done when the
@@ -143,10 +144,14 @@ int ecryptfs_privileged_open(struct file **lower_file,
* destroyed. */
dget(lower_dentry);
mntget(lower_mnt);
- (*lower_file) = dentry_open(lower_dentry, lower_mnt,
- (O_RDWR | O_LARGEFILE), cred);
+ flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
+ (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred);
if (!IS_ERR(*lower_file))
goto out;
+ if (flags & O_RDONLY) {
+ rc = PTR_ERR((*lower_file));
+ goto out;
+ }
req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
if (!req) {
rc = -ENOMEM;
@@ -180,21 +185,8 @@ int ecryptfs_privileged_open(struct file **lower_file,
__func__);
goto out_unlock;
}
- if (IS_ERR(*req->lower_file)) {
+ if (IS_ERR(*req->lower_file))
rc = PTR_ERR(*req->lower_file);
- dget(lower_dentry);
- mntget(lower_mnt);
- (*lower_file) = dentry_open(lower_dentry, lower_mnt,
- (O_RDONLY | O_LARGEFILE), cred);
- if (IS_ERR(*lower_file)) {
- rc = PTR_ERR(*req->lower_file);
- (*lower_file) = NULL;
- printk(KERN_WARNING "%s: Error attempting privileged "
- "open of lower file with either RW or RO "
- "perms; rc = [%d]. Giving up.\n",
- __func__, rc);
- }
- }
out_unlock:
mutex_unlock(&req->mux);
out_free:
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f0aa9883c28..c6ac85d6c701 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
#include <linux/key.h>
#include <linux/parser.h>
#include <linux/fs_stack.h>
+#include <linux/ima.h>
#include "ecryptfs_kernel.h"
/**
@@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
const struct cred *cred = current_cred();
struct ecryptfs_inode_info *inode_info =
ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+ int opened_lower_file = 0;
int rc = 0;
mutex_lock(&inode_info->lower_file_mutex);
@@ -129,15 +131,17 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
rc = ecryptfs_privileged_open(&inode_info->lower_file,
lower_dentry, lower_mnt, cred);
- if (rc || IS_ERR(inode_info->lower_file)) {
+ if (rc) {
printk(KERN_ERR "Error opening lower persistent file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
"rc = [%d]\n", lower_dentry, lower_mnt, rc);
- rc = PTR_ERR(inode_info->lower_file);
inode_info->lower_file = NULL;
- }
+ } else
+ opened_lower_file = 1;
}
mutex_unlock(&inode_info->lower_file_mutex);
+ if (opened_lower_file)
+ ima_counts_get(inode_info->lower_file);
return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 05772aeaa8f4..df4ce99d0597 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -396,9 +396,11 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
sizeof(u64));
kfree(file_size_virt);
- if (rc)
+ if (rc < 0)
printk(KERN_ERR "%s: Error writing file size to header; "
"rc = [%d]\n", __func__, rc);
+ else
+ rc = 0;
out:
return rc;
}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index a137c6ea2fee..0cc4fafd6552 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -34,15 +34,14 @@
*
* Write data to the lower file.
*
- * Returns zero on success; non-zero on error
+ * Returns bytes written on success; less than zero on error
*/
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size)
{
struct ecryptfs_inode_info *inode_info;
- ssize_t octets_written;
mm_segment_t fs_save;
- int rc = 0;
+ ssize_t rc;
inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
mutex_lock(&inode_info->lower_file_mutex);
@@ -50,14 +49,9 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
inode_info->lower_file->f_pos = offset;
fs_save = get_fs();
set_fs(get_ds());
- octets_written = vfs_write(inode_info->lower_file, data, size,
- &inode_info->lower_file->f_pos);
+ rc = vfs_write(inode_info->lower_file, data, size,
+ &inode_info->lower_file->f_pos);
set_fs(fs_save);
- if (octets_written < 0) {
- printk(KERN_ERR "%s: octets_written = [%td]; "
- "expected [%td]\n", __func__, octets_written, size);
- rc = -EINVAL;
- }
mutex_unlock(&inode_info->lower_file_mutex);
mark_inode_dirty_sync(ecryptfs_inode);
return rc;
@@ -91,6 +85,8 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
+ offset_in_page);
virt = kmap(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
+ if (rc > 0)
+ rc = 0;
kunmap(page_for_lower);
return rc;
}
@@ -229,30 +225,24 @@ out:
* Read @size bytes of data at byte offset @offset from the lower
* inode into memory location @data.
*
- * Returns zero on success; non-zero on error
+ * Returns bytes read on success; 0 on EOF; less than zero on error
*/
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode)
{
struct ecryptfs_inode_info *inode_info =
ecryptfs_inode_to_private(ecryptfs_inode);
- ssize_t octets_read;
mm_segment_t fs_save;
- int rc = 0;
+ ssize_t rc;
mutex_lock(&inode_info->lower_file_mutex);
BUG_ON(!inode_info->lower_file);
inode_info->lower_file->f_pos = offset;
fs_save = get_fs();
set_fs(get_ds());
- octets_read = vfs_read(inode_info->lower_file, data, size,
- &inode_info->lower_file->f_pos);
+ rc = vfs_read(inode_info->lower_file, data, size,
+ &inode_info->lower_file->f_pos);
set_fs(fs_save);
- if (octets_read < 0) {
- printk(KERN_ERR "%s: octets_read = [%td]; "
- "expected [%td]\n", __func__, octets_read, size);
- rc = -EINVAL;
- }
mutex_unlock(&inode_info->lower_file_mutex);
return rc;
}
@@ -284,6 +274,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
virt = kmap(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
+ if (rc > 0)
+ rc = 0;
kunmap(page_for_ecryptfs);
flush_dcache_page(page_for_ecryptfs);
return rc;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 12d649602d3a..b15a43a80ab7 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -77,7 +77,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
struct ecryptfs_inode_info *inode_info;
inode_info = ecryptfs_inode_to_private(inode);
- mutex_lock(&inode_info->lower_file_mutex);
if (inode_info->lower_file) {
struct dentry *lower_dentry =
inode_info->lower_file->f_dentry;
@@ -89,7 +88,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
d_drop(lower_dentry);
}
}
- mutex_unlock(&inode_info->lower_file_mutex);
ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
}
diff --git a/fs/exec.c b/fs/exec.c
index 5c833c18d0d4..d49be6bc1793 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
int suid_dumpable = 0;
/* The maximal length of core_pattern is also specified in sysctl.c */
@@ -1393,18 +1395,16 @@ out_ret:
return retval;
}
-int set_binfmt(struct linux_binfmt *new)
+void set_binfmt(struct linux_binfmt *new)
{
- struct linux_binfmt *old = current->binfmt;
+ struct mm_struct *mm = current->mm;
- if (new) {
- if (!try_module_get(new->module))
- return -1;
- }
- current->binfmt = new;
- if (old)
- module_put(old->module);
- return 0;
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+
+ mm->binfmt = new;
+ if (new)
+ __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);
@@ -1728,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
return (ret >= 2) ? 2 : ret;
}
+static void wait_for_dump_helpers(struct file *file)
+{
+ struct pipe_inode_info *pipe;
+
+ pipe = file->f_path.dentry->d_inode->i_pipe;
+
+ pipe_lock(pipe);
+ pipe->readers++;
+ pipe->writers--;
+
+ while ((pipe->readers > 1) && (!signal_pending(current))) {
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ pipe_wait(pipe);
+ }
+
+ pipe->readers--;
+ pipe->writers++;
+ pipe_unlock(pipe);
+
+}
+
+
void do_coredump(long signr, int exit_code, struct pt_regs *regs)
{
struct core_state core_state;
@@ -1744,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
char **helper_argv = NULL;
int helper_argc = 0;
- char *delimit;
+ int dump_count = 0;
+ static atomic_t core_dump_count = ATOMIC_INIT(0);
audit_core_dumps(signr);
- binfmt = current->binfmt;
+ binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
@@ -1799,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
lock_kernel();
ispipe = format_corename(corename, signr);
unlock_kernel();
- /*
- * Don't bother to check the RLIMIT_CORE value if core_pattern points
- * to a pipe. Since we're not writing directly to the filesystem
- * RLIMIT_CORE doesn't really apply, as no actual core file will be
- * created unless the pipe reader choses to write out the core file
- * at which point file size limits and permissions will be imposed
- * as it does with any other process
- */
+
if ((!ispipe) && (core_limit < binfmt->min_coredump))
goto fail_unlock;
if (ispipe) {
+ if (core_limit == 0) {
+ /*
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * core_limit of 0 here as a speacial value. Any
+ * non-zero limit gets set to RLIM_INFINITY below, but
+ * a limit of 0 skips the dump. This is a consistent
+ * way to catch recursive crashes. We can still crash
+ * if the core_pattern binary sets RLIM_CORE = !0
+ * but it runs as root, and can do lots of stupid things
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ printk(KERN_WARNING
+ "Process %d(%s) has RLIMIT_CORE set to 0\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_dropcount;
+ }
+
helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
- goto fail_unlock;
- }
- /* Terminate the string before the first option */
- delimit = strchr(corename, ' ');
- if (delimit)
- *delimit = '\0';
- delimit = strrchr(helper_argv[0], '/');
- if (delimit)
- delimit++;
- else
- delimit = helper_argv[0];
- if (!strcmp(delimit, current->comm)) {
- printk(KERN_NOTICE "Recursive core dump detected, "
- "aborting\n");
- goto fail_unlock;
+ goto fail_dropcount;
}
core_limit = RLIM_INFINITY;
/* SIGPIPE can happen, but it's just never processed */
- if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
+ if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
&file)) {
printk(KERN_INFO "Core dump to %s pipe failed\n",
corename);
- goto fail_unlock;
+ goto fail_dropcount;
}
} else
file = filp_open(corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
if (IS_ERR(file))
- goto fail_unlock;
+ goto fail_dropcount;
inode = file->f_path.dentry->d_inode;
if (inode->i_nlink > 1)
goto close_fail; /* multiple links - don't dump */
@@ -1875,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
if (retval)
current->signal->group_exit_code |= 0x80;
close_fail:
+ if (ispipe && core_pipe_limit)
+ wait_for_dump_helpers(file);
filp_close(file, NULL);
+fail_dropcount:
+ if (dump_count)
+ atomic_dec(&core_dump_count);
fail_unlock:
if (helper_argv)
argv_free(helper_argv);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5ab10c3bbebe..9f500dec3b59 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
}
lock_super(sb);
- lock_kernel();
sbi = sb->s_fs_info;
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
@@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
out:
if (or)
osd_end_request(or);
- unlock_kernel();
unlock_super(sb);
kfree(fscb);
return ret;
@@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb)
int num_pend;
struct exofs_sb_info *sbi = sb->s_fs_info;
- lock_kernel();
-
if (sb->s_dirt)
exofs_write_super(sb);
@@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb)
osduld_put_device(sbi->s_dev);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
-
- unlock_kernel();
}
/*
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1c1638f873a4..ade634076d0a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
+ .error_remove_page = generic_error_remove_page,
};
/*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index cd098a7b77fc..acf1b1423327 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_writeback_aops = {
@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_journalled_aops = {
@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
.invalidatepage = ext3_invalidatepage,
.releasepage = ext3_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext3_set_aops(struct inode *inode)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 72743d360509..7a520a862f49 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2321,7 +2321,18 @@ static int ext3_commit_super(struct super_block *sb,
if (!sbh)
return error;
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
If unsure, say N.
-config EXT4DEV_COMPAT
- bool "Enable ext4dev compatibility"
- depends on EXT4_FS
- help
- Starting with 2.6.28, the name of the ext4 filesystem was
- renamed from ext4dev to ext4. Unfortunately there are some
- legacy userspace programs (such as klibc's fstype) have
- "ext4dev" hardcoded.
-
- To enable backwards compatibility so that systems that are
- still expecting to mount ext4 filesystems using ext4dev,
- choose Y here. This feature will go away by 2.6.31, so
- please arrange to get your userspace programs fixed!
-
config EXT4_FS_XATTR
bool "Ext4 extended attributes"
depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..984ca0cb38c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
int pages_written;
int retval;
};
+#define DIO_AIO_UNWRITTEN 0x1
+typedef struct ext4_io_end {
+ struct list_head list; /* per-file finished AIO list */
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* unwritten or not */
+ int error; /* I/O error code */
+ ext4_lblk_t offset; /* offset in the file */
+ size_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+} ext4_io_end_t;
/*
* Special inodes numbers
@@ -347,7 +363,16 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
-
+ /* caller is from the direct IO path, request to creation of an
+ unitialized extents if not allocated, split the uninitialized
+ extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_DIO 0x0010
+#define EXT4_GET_BLOCKS_CONVERT 0x0020
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after direct IO complete */
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ EXT4_GET_BLOCKS_DIO_CREATE_EXT)
/*
* ioctl commands
@@ -500,8 +525,8 @@ struct move_extent {
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- time->tv_sec >> 32 : 0) |
- ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
if (sizeof(time->tv_sec) > 4)
time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
<< 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -672,6 +697,11 @@ struct ext4_inode_info {
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
+
+ /* completed async DIOs that might need unwritten extents handling */
+ struct list_head i_aio_dio_complete_list;
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
};
/*
@@ -942,18 +972,11 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_max_writeback_mb_bump;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
- /* history to debug policy */
- struct ext4_mb_history *s_mb_history;
- int s_mb_history_cur;
- int s_mb_history_max;
- int s_mb_history_num;
- spinlock_t s_mb_history_lock;
- int s_mb_history_filter;
-
/* stats for buddy allocator */
spinlock_t s_mb_pa_lock;
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1003,9 @@ struct ext4_sb_info {
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
-#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
static inline int ext4_handle_valid(handle_t *handle)
{
- if (handle == EXT4_NOJOURNAL_HANDLE)
+ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
return 0;
return 1;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..10539e364283 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
@@ -1586,7 +1586,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext, int flag)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
merge:
/* try to merge extents to the right */
- ext4_ext_try_to_merge(inode, path, nearex);
+ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
*/
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
printk(", stats");
#endif
printk("\n");
+#endif
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}
#define EXT4_EXT_ZERO_LEN 7
-
/*
* This function is called by ext4_ext_get_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path,
+ ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2757,7 +2761,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2785,6 +2789,324 @@ fix_extent_len:
}
/*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitized extent may result in splitting the uninitialized
+ * extent into multiple /intialized unintialized extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be uninitialized
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the uninitialized extent before DIO submit
+ * the IO. The uninitilized extent called at this time will be split
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t iblock,
+ unsigned int max_blocks,
+ int flags)
+{
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex1 = NULL;
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+ ext4_lblk_t ee_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+ int ret = 0;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu,"
+ "iblock %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
+ * if the entire unintialized extent length less than
+ * the size of extent to write, there is no need to split
+ * uninitialized extent
+ */
+ if (allocated <= max_blocks)
+ return ret;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* ex1: ee_block to iblock - 1 : uninitialized */
+ if (iblock > ee_block) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * for sanity, update the length of the ex2 extent before
+ * we insert ex3, if ex1 is NULL. This is to avoid temporary
+ * overlap of blocks.
+ */
+ if (!ex1 && allocated > max_blocks)
+ ex2->ee_len = cpu_to_le16(max_blocks);
+ /* ex3: to ee_block + ee_len : uninitialised */
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ ex3 = &newex;
+ ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+ ext4_ext_store_pblock(ex3, newblock + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zeroed the full extent */
+ /* blocks available from iblock */
+ return allocated;
+
+ } else if (err)
+ goto fix_extent_len;
+ /*
+ * The depth, and hence eh & ex might change
+ * as part of the insert above.
+ */
+ newdepth = ext_depth(inode);
+ /*
+ * update the extent length after successful insert of the
+ * split extent
+ */
+ orig_ex.ee_len = cpu_to_le16(ee_len -
+ ext4_ext_get_actual_len(ex3));
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ if (ex2 != &newex)
+ ex2 = ex;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ allocated = max_blocks;
+ }
+ /*
+ * If there was a change of depth as part of the
+ * insertion of ex3 above, we need to update the length
+ * of the ex1 extent again here
+ */
+ if (ex1 && ex1 != ex) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+ * uninitialised still.
+ */
+ ex2->ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(ex2, newblock);
+ ex2->ee_len = cpu_to_le16(allocated);
+ ext4_ext_mark_uninitialized(ex2);
+ if (ex2 != ex)
+ goto insert;
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ ext_debug("out here\n");
+ goto out;
+insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zero out the first half */
+ return allocated;
+ } else if (err)
+ goto fix_extent_len;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err ? err : allocated;
+
+fix_extent_len:
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_mark_uninitialized(ex);
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ struct ext4_extent_header *eh;
+ int depth;
+ int err = 0;
+ int ret = 0;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as initialized */
+ ext4_ext_mark_initialized(ex);
+
+ /*
+ * We have to see if it can be merged with the extent
+ * on the left.
+ */
+ if (ex > EXT_FIRST_EXTENT(eh)) {
+ /*
+ * To merge left, pass "ex - 1" to try_to_merge(),
+ * since it merges towards right _only_.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ ex--;
+ }
+ }
+ /*
+ * Try to Merge towards right.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ }
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, struct buffer_head *bh_result,
+ ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+ "block %llu, max_blocks %u, flags %d, allocated %u",
+ inode->i_ino, (unsigned long long)iblock, max_blocks,
+ flags, allocated);
+ ext4_ext_show_leaf(inode, path);
+
+ /* DIO get_block() before submit the IO, split the extent */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
+ /* flag the io_end struct that we need convert when IO done */
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ goto out;
+ }
+ /* DIO end_io complete, convert the filled extent to written */
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
+ goto out2;
+ }
+ /* buffered IO case */
+ /*
+ * repeat fallocate creation request
+ * we already have an unwritten extent
+ */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ goto map_out;
+
+ /* buffered READ or buffered write_begin() lookup */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * We have blocks reserved already. We
+ * return allocated blocks so that delalloc
+ * won't do block reservation for us. But
+ * the buffer head will be unmapped so that
+ * a read from the block returns 0s.
+ */
+ set_buffer_unwritten(bh_result);
+ goto out1;
+ }
+
+ /* buffered write, writepage time, convert*/
+ ret = ext4_ext_convert_to_initialized(handle, inode,
+ path, iblock,
+ max_blocks);
+out:
+ if (ret <= 0) {
+ err = ret;
+ goto out2;
+ } else
+ allocated = ret;
+ set_buffer_new(bh_result);
+map_out:
+ set_buffer_mapped(bh_result);
+out1:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+ ext4_ext_show_leaf(inode, path);
+ bh_result->b_bdev = inode->i_sb->s_bdev;
+ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return err ? err : allocated;
+}
+/*
* Block allocation/map/preallocation routine for extents based files
*
*
@@ -2814,6 +3136,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_EXT_CACHE_EXTENT);
goto out;
}
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
- goto out;
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- if (allocated > max_blocks)
- allocated = max_blocks;
- /*
- * We have blocks reserved already. We
- * return allocated blocks so that delalloc
- * won't do block reservation for us. But
- * the buffer head will be unmapped so that
- * a read from the block returns 0s.
- */
- set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
- goto out2;
- }
-
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
- if (ret <= 0) {
- err = ret;
- goto out2;
- } else
- allocated = ret;
- goto outnew;
+ ret = ext4_ext_handle_uninitialized_extents(handle,
+ inode, iblock, max_blocks, path,
+ flags, allocated, bh_result, newblock);
+ return ret;
}
}
@@ -2986,9 +3286,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
+ /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ /*
+ * io_end structure was created for every async
+ * direct IO write to the middle of the file.
+ * To avoid unecessary convertion for every aio dio rewrite
+ * to the mid of file, here we flag the IO that is really
+ * need the convertion.
+ *
+ */
+ if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ io->flag = DIO_AIO_UNWRITTEN;
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
@@ -3002,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
-outnew:
set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3512,63 @@ retry:
}
/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ ext4_lblk_t block;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct buffer_head map_bh;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ block = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
+ */
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - block;
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ while (ret >= 0 && ret < max_blocks) {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ map_bh.b_state = 0;
+ ret = ext4_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ if (ret <= 0) {
+ WARN_ON(ret <= 0);
+ printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ "returned error inode#%lu, block=%u, "
+ "max_blocks=%u", __func__,
+ inode->i_ino, block, max_blocks);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2 )
+ break;
+ }
+ return ret > 0 ? ret2 : ret;
+}
+/*
* Callback function called for each extent to gather FIEMAP information.
*/
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5ca3eca70a1e..9630583cef28 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -81,7 +81,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
-static struct vm_operations_struct ext4_file_vm_ops = {
+static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ext4_page_mkwrite,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
+ *
+ * i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
trace_ext4_sync_file(file, dentry, datasync);
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+ goto out;
/*
* data=writeback:
* The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a798737e305..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -1145,6 +1146,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
}
/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct pagevec pvec;
+ pgoff_t num = 0;
+ int i, nr_pages, done = 0;
+
+ if (max_pages == 0)
+ return 0;
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ index = idx;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ (pgoff_t)PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ lock_page(page);
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page) ||
+ PageWriteback(page) ||
+ page->index != idx) {
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_delay(bh) &&
+ !buffer_unwritten(bh))
+ done = 1;
+ bh = bh->b_this_page;
+ } while (!done && (bh != head));
+ }
+ unlock_page(page);
+ if (done)
+ break;
+ idx++;
+ num++;
+ if (num >= max_pages)
+ break;
+ }
+ pagevec_release(&pvec);
+ }
+ return num;
+}
+
+/*
* The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -1175,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
+ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, max_blocks,
+ (unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1796,11 +1858,11 @@ repeat:
if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ vfs_dq_release_reservation_block(inode, total);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
- vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_EMERG "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
- printk(KERN_EMERG "Free/Dirty block details\n");
- printk(KERN_EMERG "free_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
- printk(KERN_EMERG "dirty_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- printk(KERN_EMERG "Block reservation details\n");
- printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ printk(KERN_CRIT "Total free blocks count %lld\n",
+ ext4_count_free_blocks(inode->i_sb));
+ printk(KERN_CRIT "Free/Dirty block details\n");
+ printk(KERN_CRIT "free_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+ printk(KERN_CRIT "dirty_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ printk(KERN_CRIT "Block reservation details\n");
+ printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
@@ -2189,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* writepage and writepages will again try to write
* the same.
*/
- printk(KERN_EMERG "%s block allocation failed for inode %lu "
- "at logical offset %llu with max blocks "
- "%zd with error %d\n",
- __func__, mpd->inode->i_ino,
- (unsigned long long)next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- printk(KERN_EMERG "This should not happen.!! "
- "Data will be lost\n");
+ ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+ "delayed block allocation failed for inode %lu at "
+ "logical offset %llu with max blocks %zd with "
+ "error %d\n", mpd->inode->i_ino,
+ (unsigned long long) next,
+ mpd->b_size >> mpd->inode->i_blkbits, err);
+ printk(KERN_CRIT "This should not happen!! "
+ "Data will be lost\n");
if (err == -ENOSPC) {
ext4_print_free_blocks(mpd->inode);
}
@@ -2743,8 +2805,10 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ unsigned int max_pages;
int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0, nr_to_writebump = 0;
+ int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0;
loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2771,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
- /*
- * Make sure nr_to_write is >= sbi->s_mb_stream_request
- * This make sure small files blocks are allocated in
- * single attempt. This ensure that small files
- * get less fragmented.
- */
- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
- wbc->nr_to_write = sbi->s_mb_stream_request;
- }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2795,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
} else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectues have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole)
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
mpd.wbc = wbc;
mpd.inode = mapping->host;
@@ -2822,10 +2906,9 @@ retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- printk(KERN_CRIT "%s: jbd2_start: "
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- dump_stack();
goto out_writepages;
}
@@ -2897,9 +2980,10 @@ retry:
goto retry;
}
if (pages_skipped != wbc->pages_skipped)
- printk(KERN_EMERG "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
- __func__, wbc->nr_to_write, ret);
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
@@ -2914,7 +2998,8 @@ retry:
out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
- wbc->nr_to_write -= nr_to_writebump;
+ if (wbc->nr_to_write > nr_to_writebump)
+ wbc->nr_to_write -= nr_to_writebump;
wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
@@ -3272,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3280,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3291,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -3313,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
}
}
+retry:
ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -3354,6 +3445,359 @@ out:
return ret;
}
+/* Maximum number of blocks we map for direct IO at once. */
+
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+ * allocation for holes, to prevent expose stale data
+ * out when there is parallel buffered read (which does
+ * not hold the i_mutex lock) while direct IO write has
+ * not completed. DIO request on holes finally falls back
+ * to buffered IO for this reason.
+ *
+ * For ext4 extent based file, since we support fallocate,
+ * new allocated extent as uninitialized, for holes, we
+ * could fallocate blocks for holes, thus parallel
+ * buffered IO read will zero out the page when read on
+ * a hole while parallel DIO write to the hole has not completed.
+ *
+ * when we come here, we know it's a direct IO write to
+ * to the middle of file (<i_size)
+ * so it's safe to override the create flag from VFS.
+ */
+ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ iput(io->inode);
+ kfree(io);
+}
+static void dump_aio_dio_list(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+
+ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ if (list_empty(&io->list))
+ return ret;
+
+ if (io->flag != DIO_AIO_UNWRITTEN)
+ return ret;
+
+ if (offset + size <= i_size_read(inode))
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
+ return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret >= 0) {
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ int ret = 0;
+ int ret2 = 0;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ return ret;
+
+ dump_aio_dio_list(inode);
+ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ ext4_io_end_t, list);
+ /*
+ * Calling ext4_end_aio_dio_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
+ }
+ return (ret2 < 0) ? ret2 : 0;
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
+ igrab(inode);
+ io->inode = inode;
+ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ INIT_LIST_HEAD(&io->list);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != DIO_AIO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
+ return;
+ }
+
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ /* Add the io_end to per-inode completed aio dio list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be defered when
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
+ */
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ iocb->private = ext4_init_io_end(inode);
+ if (!iocb->private)
+ return -ENOMEM;
+ /*
+ * we save the io structure for current async
+ * direct IO, so that later ext4_get_blocks()
+ * could flag the io structure whether there
+ * is a unwritten extents needs to be converted
+ * when IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode need to be dropped, when IO is
+ * complete, even with 0 byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * desctroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0)
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the convertion right here
+ */
+ ret = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ return ret;
+ }
+
+ /* for write the the end of file case, we fall back to old way */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3386,6 +3830,7 @@ static const struct address_space_operations ext4_ordered_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_writeback_aops = {
@@ -3401,6 +3846,7 @@ static const struct address_space_operations ext4_writeback_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_journalled_aops = {
@@ -3415,6 +3861,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_da_aops = {
@@ -3431,6 +3878,7 @@ static const struct address_space_operations ext4_da_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void ext4_set_aops(struct inode *inode)
@@ -4547,8 +4995,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
- struct ext4_iloc *iloc,
- int do_sync)
+ struct ext4_iloc *iloc)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4649,22 +5096,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
- /*
- * If we're not using a journal and we were called from
- * ext4_write_inode() to sync the inode (making do_sync true),
- * we can just use sync_dirty_buffer() directly to do our dirty
- * work. Testing s_journal here is a bit redundant but it's
- * worth it to avoid potential future trouble.
- */
- if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
- BUFFER_TRACE(bh, "call sync_dirty_buffer");
- sync_dirty_buffer(bh);
- } else {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!err)
- err = rc;
- }
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!err)
+ err = rc;
ei->i_state &= ~EXT4_STATE_NEW;
out_brelse:
@@ -4732,8 +5167,16 @@ int ext4_write_inode(struct inode *inode, int wait)
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
- err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
- inode, &iloc, wait);
+ if (wait)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_error(inode->i_sb, __func__,
+ "IO error syncing inode, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)iloc.bh->b_blocknr);
+ err = -EIO;
+ }
}
return err;
}
@@ -5029,7 +5472,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
- err = ext4_do_update_inode(handle, inode, iloc, 0);
+ err = ext4_do_update_inode(handle, inode, iloc);
put_bh(iloc->bh);
return err;
}
@@ -5173,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
*/
void ext4_dirty_inode(struct inode *inode)
{
- handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- if (!ext4_handle_valid(current_handle)) {
- ext4_mark_inode_dirty(current_handle, inode);
- return;
- }
-
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __func__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
return err;
}
-#ifdef EXT4_MB_HISTORY
-struct ext4_mb_proc_session {
- struct ext4_mb_history *history;
- struct super_block *sb;
- int start;
- int max;
-};
-
-static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
- struct ext4_mb_history *hs,
- int first)
-{
- if (hs == s->history + s->max)
- hs = s->history;
- if (!first && hs == s->history + s->start)
- return NULL;
- while (hs->orig.fe_len == 0) {
- hs++;
- if (hs == s->history + s->max)
- hs = s->history;
- if (hs == s->history + s->start)
- return NULL;
- }
- return hs;
-}
-
-static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs;
- int l = *pos;
-
- if (l == 0)
- return SEQ_START_TOKEN;
- hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- if (!hs)
- return NULL;
- while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
- return hs;
-}
-
-static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
- loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs = v;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- else
- return ext4_mb_history_skip_empty(s, ++hs, 0);
-}
-
-static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
-{
- char buf[25], buf2[25], buf3[25], *fmt;
- struct ext4_mb_history *hs = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-6s %-5s %-5s %-6s\n",
- "pid", "inode", "original", "goal", "result", "found",
- "grps", "cr", "flags", "merge", "tail", "broken");
- return 0;
- }
-
- if (hs->op == EXT4_MB_HISTORY_ALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "0x%04x %-5s %-5u %-6u\n";
- sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
- hs->goal.fe_start, hs->goal.fe_len,
- hs->goal.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
- hs->found, hs->groups, hs->cr, hs->flags,
- hs->merged ? "M" : "", hs->tail,
- hs->buddy ? 1 << hs->buddy : 0);
- } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s\n";
- sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
- } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
- sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s discard\n",
- hs->pid, hs->ino, buf2);
- } else if (hs->op == EXT4_MB_HISTORY_FREE) {
- sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s free\n",
- hs->pid, hs->ino, buf2);
- }
- return 0;
-}
-
-static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_mb_seq_history_ops = {
- .start = ext4_mb_seq_history_start,
- .next = ext4_mb_seq_history_next,
- .stop = ext4_mb_seq_history_stop,
- .show = ext4_mb_seq_history_show,
-};
-
-static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
-{
- struct super_block *sb = PDE(inode)->data;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_mb_proc_session *s;
- int rc;
- int size;
-
- if (unlikely(sbi->s_mb_history == NULL))
- return -ENOMEM;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL)
- return -ENOMEM;
- s->sb = sb;
- size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
- s->history = kmalloc(size, GFP_KERNEL);
- if (s->history == NULL) {
- kfree(s);
- return -ENOMEM;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(s->history, sbi->s_mb_history, size);
- s->max = sbi->s_mb_history_max;
- s->start = sbi->s_mb_history_cur % s->max;
- spin_unlock(&sbi->s_mb_history_lock);
-
- rc = seq_open(file, &ext4_mb_seq_history_ops);
- if (rc == 0) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = s;
- } else {
- kfree(s->history);
- kfree(s);
- }
- return rc;
-
-}
-
-static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- kfree(s->history);
- kfree(s);
- return seq_release(inode, file);
-}
-
-static ssize_t ext4_mb_seq_history_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- struct super_block *sb = s->sb;
- char str[32];
- int value;
-
- if (count >= sizeof(str)) {
- printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
- "mb_history", (int)sizeof(str));
- return -EOVERFLOW;
- }
-
- if (copy_from_user(str, buffer, count))
- return -EFAULT;
-
- value = simple_strtol(str, NULL, 0);
- if (value < 0)
- return -ERANGE;
- EXT4_SB(sb)->s_mb_history_filter = value;
-
- return count;
-}
-
-static const struct file_operations ext4_mb_seq_history_fops = {
- .owner = THIS_MODULE,
- .open = ext4_mb_seq_history_open,
- .read = seq_read,
- .write = ext4_mb_seq_history_write,
- .llseek = seq_lseek,
- .release = ext4_mb_seq_history_release,
-};
-
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
.release = seq_release,
};
-static void ext4_mb_history_release(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_proc != NULL) {
- remove_proc_entry("mb_groups", sbi->s_proc);
- if (sbi->s_mb_history_max)
- remove_proc_entry("mb_history", sbi->s_proc);
- }
- kfree(sbi->s_mb_history);
-}
-
-static void ext4_mb_history_init(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
-
- if (sbi->s_proc != NULL) {
- if (sbi->s_mb_history_max)
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
- proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_groups_fops, sb);
- }
-
- sbi->s_mb_history_cur = 0;
- spin_lock_init(&sbi->s_mb_history_lock);
- i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
- /* if we can't allocate history, then we simple won't use it */
-}
-
-static noinline_for_stack void
-ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_mb_history h;
-
- if (sbi->s_mb_history == NULL)
- return;
-
- if (!(ac->ac_op & sbi->s_mb_history_filter))
- return;
-
- h.op = ac->ac_op;
- h.pid = current->pid;
- h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
- h.orig = ac->ac_o_ex;
- h.result = ac->ac_b_ex;
- h.flags = ac->ac_flags;
- h.found = ac->ac_found;
- h.groups = ac->ac_groups_scanned;
- h.cr = ac->ac_criteria;
- h.tail = ac->ac_tail;
- h.buddy = ac->ac_buddy;
- h.merged = 0;
- if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
- if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
- ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
- h.merged = 1;
- h.goal = ac->ac_g_ex;
- h.result = ac->ac_f_ex;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
- if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
- sbi->s_mb_history_cur = 0;
- spin_unlock(&sbi->s_mb_history_lock);
-}
-
-#else
-#define ext4_mb_history_release(sb)
-#define ext4_mb_history_init(sb)
-#endif
-
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
- ext4_mb_history_init(sb);
+ if (sbi->s_proc)
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_fops, sb);
if (sbi->s_journal)
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
- printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
- ext4_mb_history_release(sb);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_inc(&sbi->s_bal_breaks);
}
- ext4_mb_store_history(ac);
+ if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+ trace_ext4_mballoc_alloc(ac);
+ else
+ trace_ext4_mballoc_prealloc(ac);
}
/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (ac) {
ac->ac_sb = sb;
ac->ac_inode = pa->pa_inode;
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
}
while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = next - bit;
ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_discard(ac);
}
trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
- if (ac)
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-
trace_ext4_mb_release_group_pa(ac, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = pa->pa_len;
ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_discard(ac);
}
return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- size = max(size, isize);
if ((size == isize) &&
!ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
}
/* don't use group allocation for large files */
+ size = max(size, isize);
if (size >= sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (ac) {
- ac->ac_op = EXT4_MB_HISTORY_FREE;
ac->ac_inode = inode;
ac->ac_sb = sb;
}
@@ -4806,7 +4527,7 @@ do_more:
ac->ac_b_ex.fe_group = block_group;
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = count;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_free(ac);
}
err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
#define mb_debug(n, fmt, a...)
#endif
-/*
- * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
- * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
- */
-#define EXT4_MB_HISTORY
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
-#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
-#define EXT4_MB_HISTORY_FREE 8 /* free */
-
-#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
- EXT4_MB_HISTORY_PREALLOC)
/*
* How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
-#define MB_DEFAULT_STATS 1
+#define MB_DEFAULT_STATS 0
/*
* files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
#define AC_STATUS_FOUND 2
#define AC_STATUS_BREAK 3
-struct ext4_mb_history {
- struct ext4_free_extent orig; /* orig allocation */
- struct ext4_free_extent goal; /* goal allocation */
- struct ext4_free_extent result; /* result allocation */
- unsigned pid;
- unsigned ino;
- __u16 found; /* how many extents have been found */
- __u16 groups; /* how many groups have been scanned */
- __u16 tail; /* what tail broke some buddy */
- __u16 buddy; /* buddy the tail ^^^ broke */
- __u16 flags;
- __u8 cr:3; /* which phase the result extent was found at */
- __u8 op:4;
- __u8 merged:1;
-};
-
struct ext4_buddy {
struct page *bd_buddy_page;
void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
-#ifndef EXT4_MB_HISTORY
-static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- return;
-}
-#endif
-
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, new_ext))
+ orig_path, new_ext, 0))
goto out;
}
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, end_ext))
+ orig_path, end_ext, 0))
goto out;
}
out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
- ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
int block_len_in_page;
int uninit;
+ /* orig and donor should be different file */
+ if (orig_inode->i_ino == donor_inode->i_ino) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
/* protect orig and donor against a truncate */
ret1 = mext_inode_double_lock(orig_inode, donor_inode);
if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..7c8fe80bacdd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- if (!ext4_handle_valid(handle))
+ /* ext4_handle_valid() assumes a valid handle_t pointer */
+ if (handle && !ext4_handle_valid(handle))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..312211ee05af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
-static int default_mb_history_length = 1000;
-
-module_param_named(default_mb_history_length, default_mb_history_length,
- int, 0644);
-MODULE_PARM_DESC(default_mb_history_length,
- "Default number of entries saved for mb_history");
-
struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
}
return jbd2_journal_start(journal, nblocks);
}
- /*
- * We're not journaling, return the appropriate indication.
- */
- current->journal_info = EXT4_NOJOURNAL_HANDLE;
- return current->journal_info;
+ return ext4_get_nojournal();
}
/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
int rc;
if (!ext4_handle_valid(handle)) {
- /*
- * Do this here since we don't call jbd2_journal_stop() in
- * no-journal mode.
- */
- current->journal_info = NULL;
+ ext4_put_nojournal(handle);
return 0;
}
sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ ei->cur_aio_dio = NULL;
return &ei->vfs_inode;
}
@@ -1052,7 +1072,7 @@ enum {
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
- {Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break;
- case Opt_mb_history_length:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_mb_history_max = option;
- break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- if (EXT4_SB(sb)->s_journal) {
- ext4_msg(sb, KERN_INFO, "%s journal on %s",
- EXT4_SB(sb)->s_journal->j_inode ? "internal" :
- "external", EXT4_SB(sb)->s_journal->j_devname);
- } else {
- ext4_msg(sb, KERN_INFO, "no journal");
- }
return res;
}
@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
NULL,
};
@@ -2413,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER);
@@ -2679,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_max_writeback_mb_bump = 128;
/*
* set up enough so that it can read an inode
@@ -2798,6 +2805,12 @@ no_journal:
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2862,12 @@ no_journal:
"available");
}
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt(sb, DELALLOC) &&
+ (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
"requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+ }
err = ext4_setup_system_zone(sb);
if (err) {
@@ -2910,6 +2923,8 @@ cantfind_ext4:
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
return -EINVAL;
}
- if (journal->j_flags & JBD2_BARRIER)
- ext4_msg(sb, KERN_INFO, "barriers enabled");
- else
+ if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ flush_workqueue(sbi->dio_unwritten_wq);
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ jbd2_log_wait_commit(sbi->s_journal, target);
}
return ret;
}
@@ -3951,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-#ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data,struct vfsmount *mnt)
-{
- printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
- "to mount using ext4\n", dev_name);
- printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
- "will go away by 2.6.31\n", dev_name);
- return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
-}
-
-static struct file_system_type ext4dev_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext4dev",
- .get_sb = ext4dev_get_sb,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS("ext4dev");
-#endif
-
static int __init init_ext4_fs(void)
{
int err;
@@ -3996,13 +3990,6 @@ static int __init init_ext4_fs(void)
err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
-#ifdef CONFIG_EXT4DEV_COMPAT
- err = register_filesystem(&ext4dev_fs_type);
- if (err) {
- unregister_filesystem(&ext4_fs_type);
- goto out;
- }
-#endif
return 0;
out:
destroy_inodecache();
@@ -4021,9 +4008,6 @@ out4:
static void __exit exit_ext4_fs(void)
{
unregister_filesystem(&ext4_fs_type);
-#ifdef CONFIG_EXT4DEV_COMPAT
- unregister_filesystem(&ext4dev_fs_type);
-#endif
destroy_inodecache();
exit_ext4_xattr();
exit_ext4_mballoc();
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176d..7db0979c6b72 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
/* fat/misc.c */
extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3))) __cold;
-extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
__le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8970d8c49bb0..76b7961ab663 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
static int fat_sync_fs(struct super_block *sb, int wait)
{
- lock_super(sb);
- fat_clusters_flush(sb);
- sb->s_dirt = 0;
- unlock_super(sb);
+ int err = 0;
- return 0;
+ if (sb->s_dirt) {
+ lock_super(sb);
+ sb->s_dirt = 0;
+ err = fat_clusters_flush(sb);
+ unlock_super(sb);
+ }
+
+ return err;
}
static void fat_put_super(struct super_block *sb)
@@ -470,19 +474,11 @@ static void fat_put_super(struct super_block *sb)
iput(sbi->fat_inode);
- if (sbi->nls_disk) {
- unload_nls(sbi->nls_disk);
- sbi->nls_disk = NULL;
- sbi->options.codepage = fat_default_codepage;
- }
- if (sbi->nls_io) {
- unload_nls(sbi->nls_io);
- sbi->nls_io = NULL;
- }
- if (sbi->options.iocharset != fat_default_iocharset) {
+ unload_nls(sbi->nls_disk);
+ unload_nls(sbi->nls_io);
+
+ if (sbi->options.iocharset != fat_default_iocharset)
kfree(sbi->options.iocharset);
- sbi->options.iocharset = fat_default_iocharset;
- }
sb->s_fs_info = NULL;
kfree(sbi);
@@ -820,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",shortname=mixed");
break;
case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
- /* seq_puts(m, ",shortname=lower"); */
+ seq_puts(m, ",shortname=lower");
break;
default:
seq_puts(m, ",shortname=unknown");
@@ -971,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
opts->codepage = fat_default_codepage;
opts->iocharset = fat_default_iocharset;
if (is_vfat) {
- opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
+ opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
opts->rodir = 0;
} else {
opts->shortname = 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 4e35be873e09..0f55f5cb732f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
/* Flushes the number of free clusters on FAT32 */
/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
-void fat_clusters_flush(struct super_block *sb)
+int fat_clusters_flush(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
struct fat_boot_fsinfo *fsinfo;
if (sbi->fat_bits != 32)
- return;
+ return 0;
bh = sb_bread(sb, sbi->fsinfo_sector);
if (bh == NULL) {
printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
- return;
+ return -EIO;
}
fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
mark_buffer_dirty(bh);
}
brelse(bh);
+
+ return 0;
}
/*
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e83557112..f565f24019b5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
int charlen;
if (utf8) {
- int name_len = strlen(name);
-
- *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
-
- /*
- * We stripped '.'s before and set len appropriately,
- * but utf8s_to_utf16s doesn't care about len
- */
- *outlen -= (name_len - len);
-
- if (*outlen > 255)
+ *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
+ if (*outlen < 0)
+ return *outlen;
+ else if (*outlen > 255)
return -ENAMETOOLONG;
op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db97..fc089f2f7f56 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
return pid;
}
+static int f_setown_ex(struct file *filp, unsigned long arg)
+{
+ struct f_owner_ex * __user owner_p = (void * __user)arg;
+ struct f_owner_ex owner;
+ struct pid *pid;
+ int type;
+ int ret;
+
+ ret = copy_from_user(&owner, owner_p, sizeof(owner));
+ if (ret)
+ return ret;
+
+ switch (owner.type) {
+ case F_OWNER_TID:
+ type = PIDTYPE_MAX;
+ break;
+
+ case F_OWNER_PID:
+ type = PIDTYPE_PID;
+ break;
+
+ case F_OWNER_GID:
+ type = PIDTYPE_PGID;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+ pid = find_vpid(owner.pid);
+ if (owner.pid && !pid)
+ ret = -ESRCH;
+ else
+ ret = __f_setown(filp, pid, type, 1);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int f_getown_ex(struct file *filp, unsigned long arg)
+{
+ struct f_owner_ex * __user owner_p = (void * __user)arg;
+ struct f_owner_ex owner;
+ int ret = 0;
+
+ read_lock(&filp->f_owner.lock);
+ owner.pid = pid_vnr(filp->f_owner.pid);
+ switch (filp->f_owner.pid_type) {
+ case PIDTYPE_MAX:
+ owner.type = F_OWNER_TID;
+ break;
+
+ case PIDTYPE_PID:
+ owner.type = F_OWNER_PID;
+ break;
+
+ case PIDTYPE_PGID:
+ owner.type = F_OWNER_GID;
+ break;
+
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ break;
+ }
+ read_unlock(&filp->f_owner.lock);
+
+ if (!ret)
+ ret = copy_to_user(owner_p, &owner, sizeof(owner));
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SETOWN:
err = f_setown(filp, arg, 1);
break;
+ case F_GETOWN_EX:
+ err = f_getown_ex(filp, arg);
+ break;
+ case F_SETOWN_EX:
+ err = f_setown_ex(filp, arg);
+ break;
case F_GETSIG:
err = filp->f_owner.signum;
break;
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
static void send_sigio_to_task(struct task_struct *p,
struct fown_struct *fown,
- int fd,
- int reason)
+ int fd, int reason, int group)
{
/*
* F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
else
si.si_band = band_table[reason - POLL_IN];
si.si_fd = fd;
- if (!group_send_sig_info(signum, &si, p))
+ if (!do_send_sig_info(signum, &si, p, group))
break;
/* fall-through: fall back on the old plain SIGIO signal */
case 0:
- group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
+ do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
}
}
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
struct task_struct *p;
enum pid_type type;
struct pid *pid;
+ int group = 1;
read_lock(&fown->lock);
+
type = fown->pid_type;
+ if (type == PIDTYPE_MAX) {
+ group = 0;
+ type = PIDTYPE_PID;
+ }
+
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
read_lock(&tasklist_lock);
do_each_pid_task(pid, type, p) {
- send_sigio_to_task(p, fown, fd, band);
+ send_sigio_to_task(p, fown, fd, band, group);
} while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
}
static void send_sigurg_to_task(struct task_struct *p,
- struct fown_struct *fown)
+ struct fown_struct *fown, int group)
{
if (sigio_perm(p, fown, SIGURG))
- group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
+ do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
}
int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
struct task_struct *p;
enum pid_type type;
struct pid *pid;
+ int group = 1;
int ret = 0;
read_lock(&fown->lock);
+
type = fown->pid_type;
+ if (type == PIDTYPE_MAX) {
+ group = 0;
+ type = PIDTYPE_PID;
+ }
+
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
read_lock(&tasklist_lock);
do_each_pid_task(pid, type, p) {
- send_sigurg_to_task(p, fown);
+ send_sigurg_to_task(p, fown, group);
} while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
out_unlock_fown:
diff --git a/fs/file.c b/fs/file.c
index f313314f996f..87e129030ab1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/time.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
* Handle nr_files sysctl
*/
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
- return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ return proc_dointvec(table, write, buffer, lenp, ppos);
}
#else
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8e1e5e19d21e..9d5360c4c2af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,8 +41,9 @@ struct wb_writeback_args {
long nr_pages;
struct super_block *sb;
enum writeback_sync_modes sync_mode;
- int for_kupdate;
- int range_cyclic;
+ int for_kupdate:1;
+ int range_cyclic:1;
+ int for_background:1;
};
/*
@@ -249,14 +250,25 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
* completion. Caller need not hold sb s_umount semaphore.
*
*/
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+ long nr_pages)
{
struct wb_writeback_args args = {
+ .sb = sb,
.sync_mode = WB_SYNC_NONE,
.nr_pages = nr_pages,
.range_cyclic = 1,
};
+ /*
+ * We treat @nr_pages=0 as the special case to do background writeback,
+ * ie. to sync pages until the background dirty threshold is reached.
+ */
+ if (!nr_pages) {
+ args.nr_pages = LONG_MAX;
+ args.for_background = 1;
+ }
+
bdi_alloc_queue_work(bdi, &args);
}
@@ -310,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
* For inodes being constantly redirtied, dirtied_when can get stuck.
* It _appears_ to be in the future, but is actually in distant past.
* This test is necessary to prevent such wrapped-around relative times
- * from permanently stopping the whole pdflush writeback.
+ * from permanently stopping the whole bdi writeback.
*/
ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
@@ -324,13 +336,38 @@ static void move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
unsigned long *older_than_this)
{
+ LIST_HEAD(tmp);
+ struct list_head *pos, *node;
+ struct super_block *sb = NULL;
+ struct inode *inode;
+ int do_sb_sort = 0;
+
while (!list_empty(delaying_queue)) {
- struct inode *inode = list_entry(delaying_queue->prev,
- struct inode, i_list);
+ inode = list_entry(delaying_queue->prev, struct inode, i_list);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
- list_move(&inode->i_list, dispatch_queue);
+ if (sb && sb != inode->i_sb)
+ do_sb_sort = 1;
+ sb = inode->i_sb;
+ list_move(&inode->i_list, &tmp);
+ }
+
+ /* just one sb in list, splice to dispatch_queue and we're done */
+ if (!do_sb_sort) {
+ list_splice(&tmp, dispatch_queue);
+ return;
+ }
+
+ /* Move inodes from one superblock together */
+ while (!list_empty(&tmp)) {
+ inode = list_entry(tmp.prev, struct inode, i_list);
+ sb = inode->i_sb;
+ list_for_each_prev_safe(pos, node, &tmp) {
+ inode = list_entry(pos, struct inode, i_list);
+ if (inode->i_sb == sb)
+ list_move(&inode->i_list, dispatch_queue);
+ }
}
}
@@ -439,8 +476,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
- if (!(inode->i_state & I_DIRTY) &&
- mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
+ /*
+ * More pages get dirtied by a fast dirtier.
+ */
+ goto select_queue;
+ } else if (inode->i_state & I_DIRTY) {
+ /*
+ * At least XFS will redirty the inode during the
+ * writeback (delalloc) and on io completion (isize).
+ */
+ redirty_tail(inode);
+ } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything. Redirty
@@ -462,6 +509,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* soon as the queue becomes uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
+select_queue:
if (wbc->nr_to_write <= 0) {
/*
* slice used up: queue for next turn
@@ -484,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_state |= I_DIRTY_PAGES;
redirty_tail(inode);
}
- } else if (inode->i_state & I_DIRTY) {
- /*
- * Someone redirtied the inode while were writing back
- * the pages.
- */
- redirty_tail(inode);
} else if (atomic_read(&inode->i_count)) {
/*
* The inode is clean, inuse
@@ -506,6 +548,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
+static void unpin_sb_for_writeback(struct super_block **psb)
+{
+ struct super_block *sb = *psb;
+
+ if (sb) {
+ up_read(&sb->s_umount);
+ put_super(sb);
+ *psb = NULL;
+ }
+}
+
/*
* For WB_SYNC_NONE writeback, the caller does not have the sb pinned
* before calling writeback. So make sure that we do pin it, so it doesn't
@@ -515,11 +568,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* 1 if we failed.
*/
static int pin_sb_for_writeback(struct writeback_control *wbc,
- struct inode *inode)
+ struct inode *inode, struct super_block **psb)
{
struct super_block *sb = inode->i_sb;
/*
+ * If this sb is already pinned, nothing more to do. If not and
+ * *psb is non-NULL, unpin the old one first
+ */
+ if (sb == *psb)
+ return 0;
+ else if (*psb)
+ unpin_sb_for_writeback(psb);
+
+ /*
* Caller must already hold the ref for this
*/
if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -532,7 +594,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
if (down_read_trylock(&sb->s_umount)) {
if (sb->s_root) {
spin_unlock(&sb_lock);
- return 0;
+ goto pinned;
}
/*
* umounted, drop rwsem again and fall through to failure
@@ -543,24 +605,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
sb->s_count--;
spin_unlock(&sb_lock);
return 1;
-}
-
-static void unpin_sb_for_writeback(struct writeback_control *wbc,
- struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- return;
-
- up_read(&sb->s_umount);
- put_super(sb);
+pinned:
+ *psb = sb;
+ return 0;
}
static void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
- struct super_block *sb = wbc->sb;
+ struct super_block *sb = wbc->sb, *pin_sb = NULL;
const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
@@ -619,7 +672,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
if (inode_dirtied_after(inode, start))
break;
- if (pin_sb_for_writeback(wbc, inode)) {
+ if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
requeue_io(inode);
continue;
}
@@ -628,7 +681,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
- unpin_sb_for_writeback(wbc, inode);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
@@ -648,6 +700,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
wbc->more_io = 1;
}
+ unpin_sb_for_writeback(&pin_sb);
+
spin_unlock(&inode_lock);
/* Leave any unwritten inodes on b_io */
}
@@ -706,6 +760,7 @@ static long wb_writeback(struct bdi_writeback *wb,
};
unsigned long oldest_jif;
long wrote = 0;
+ struct inode *inode;
if (wbc.for_kupdate) {
wbc.older_than_this = &oldest_jif;
@@ -719,20 +774,16 @@ static long wb_writeback(struct bdi_writeback *wb,
for (;;) {
/*
- * Don't flush anything for non-integrity writeback where
- * no nr_pages was given
+ * Stop writeback when nr_pages has been consumed
*/
- if (!args->for_kupdate && args->nr_pages <= 0 &&
- args->sync_mode == WB_SYNC_NONE)
+ if (args->nr_pages <= 0)
break;
/*
- * If no specific pages were given and this is just a
- * periodic background writeout and we are below the
- * background dirty threshold, don't do anything
+ * For background writeout, stop when we are below the
+ * background dirty threshold
*/
- if (args->for_kupdate && args->nr_pages <= 0 &&
- !over_bground_thresh())
+ if (args->for_background && !over_bground_thresh())
break;
wbc.more_io = 0;
@@ -744,13 +795,32 @@ static long wb_writeback(struct bdi_writeback *wb,
wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
/*
- * If we ran out of stuff to write, bail unless more_io got set
+ * If we consumed everything, see if we have more
*/
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- if (wbc.more_io && !wbc.for_kupdate)
- continue;
+ if (wbc.nr_to_write <= 0)
+ continue;
+ /*
+ * Didn't write everything and we don't have more IO, bail
+ */
+ if (!wbc.more_io)
break;
+ /*
+ * Did we write something? Try for more
+ */
+ if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ continue;
+ /*
+ * Nothing written. Wait for some inode to
+ * become available for writeback. Otherwise
+ * we'll just busyloop.
+ */
+ spin_lock(&inode_lock);
+ if (!list_empty(&wb->b_more_io)) {
+ inode = list_entry(wb->b_more_io.prev,
+ struct inode, i_list);
+ inode_wait_for_writeback(inode);
}
+ spin_unlock(&inode_lock);
}
return wrote;
@@ -1060,9 +1130,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
* If older_than_this is non-NULL, then only write out inodes which
* had their first dirtying at a time earlier than *older_than_this.
*
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
* If `bdi' is non-zero then we're being asked to writeback a specific queue.
* This function assumes that the blockdev superblock's inodes are backed by
* a variety of queues, so all inodes are searched. For other superblocks,
@@ -1141,7 +1208,7 @@ void writeback_inodes_sb(struct super_block *sb)
nr_to_write = nr_dirty + nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
- bdi_writeback_all(sb, nr_to_write);
+ bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
}
EXPORT_SYMBOL(writeback_inodes_sb);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e703654e7f40..992f6c9410bb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
return 0;
if (attr->ia_valid & ATTR_SIZE) {
- unsigned long limit;
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
+ err = inode_newsize_ok(inode, attr->ia_size);
+ if (err)
+ return err;
is_truncate = true;
}
@@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
* FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
*/
if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
- if (outarg.attr.size < oldsize)
- fuse_truncate(inode->i_mapping, outarg.attr.size);
+ truncate_pagecache(inode, oldsize, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index cbc464043b6f..a3492f7d207c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return 0;
}
-static struct vm_operations_struct fuse_file_vm_ops = {
+static const struct vm_operations_struct fuse_file_vm_ops = {
.close = fuse_vma_close,
.fault = filemap_fault,
.page_mkwrite = fuse_page_mkwrite,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fc9c79feb5f7..01cc462ff45d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -606,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid);
-void fuse_truncate(struct address_space *mapping, loff_t offset);
-
/**
* Initialize the client device
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6da947daabda..1a822ce2b24b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -140,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
return 0;
}
-void fuse_truncate(struct address_space *mapping, loff_t offset)
-{
- /* See vmtruncate() */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-}
-
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid)
{
@@ -205,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
spin_unlock(&fc->lock);
if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
- if (attr->size < oldsize)
- fuse_truncate(inode->i_mapping, attr->size);
+ truncate_pagecache(inode, oldsize, attr->size);
invalidate_inode_pages2(inode->i_mapping);
}
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc0..694b5d48f036 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
.direct_IO = gfs2_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
.direct_IO = gfs2_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 166f38fbd246..4eb308aa3234 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -418,7 +418,7 @@ out:
return ret;
}
-static struct vm_operations_struct gfs2_vm_ops = {
+static const struct vm_operations_struct gfs2_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = gfs2_page_mkwrite,
};
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c3ac18054057..247436c10deb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/namei.h>
-#include <linux/utsname.h>
#include <linux/mm.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 7b6165f25fbe..8bbe03c3f6d5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb)
brelse(HFS_SB(sb)->mdb_bh);
brelse(HFS_SB(sb)->alt_mdb_bh);
- if (HFS_SB(sb)->nls_io)
- unload_nls(HFS_SB(sb)->nls_io);
- if (HFS_SB(sb)->nls_disk)
- unload_nls(HFS_SB(sb)->nls_disk);
+ unload_nls(HFS_SB(sb)->nls_io);
+ unload_nls(HFS_SB(sb)->nls_disk);
free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
kfree(HFS_SB(sb));
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c0759fe0855b..43022f3d5148 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb)
iput(HFSPLUS_SB(sb).alloc_file);
iput(HFSPLUS_SB(sb).hidden_dir);
brelse(HFSPLUS_SB(sb).s_vhbh);
- if (HFSPLUS_SB(sb).nls)
- unload_nls(HFSPLUS_SB(sb).nls);
+ unload_nls(HFSPLUS_SB(sb).nls);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
@@ -464,8 +463,7 @@ out:
cleanup:
hfsplus_put_super(sb);
- if (nls)
- unload_nls(nls);
+ unload_nls(nls);
return err;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index eba6d552d9c9..87a1258953b8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -380,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
{
- struct super_block *sb = inode->i_sb;
-
- if (!hlist_unhashed(&inode->i_hash)) {
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- list_move(&inode->i_list, &inode_unused);
- inodes_stat.nr_unused++;
- if (!sb || (sb->s_flags & MS_ACTIVE)) {
- spin_unlock(&inode_lock);
- return;
- }
- inode->i_state |= I_WILL_FREE;
- spin_unlock(&inode_lock);
- /*
- * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
- * in our backing_dev_info.
- */
- write_inode_now(inode, 1);
- spin_lock(&inode_lock);
- inode->i_state &= ~I_WILL_FREE;
- inodes_stat.nr_unused--;
- hlist_del_init(&inode->i_hash);
+ if (generic_detach_inode(inode)) {
+ truncate_hugepages(inode, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
}
- list_del_init(&inode->i_list);
- list_del_init(&inode->i_sb_list);
- inode->i_state |= I_FREEING;
- inodes_stat.nr_inodes--;
- spin_unlock(&inode_lock);
- truncate_hugepages(inode, 0);
- clear_inode(inode);
- destroy_inode(inode);
}
static void hugetlbfs_drop_inode(struct inode *inode)
@@ -936,15 +911,9 @@ static struct file_system_type hugetlbfs_fs_type = {
static struct vfsmount *hugetlbfs_vfsmount;
-static int can_do_hugetlb_shm(int creat_flags)
+static int can_do_hugetlb_shm(void)
{
- if (creat_flags != HUGETLB_SHMFS_INODE)
- return 0;
- if (capable(CAP_IPC_LOCK))
- return 1;
- if (in_group_p(sysctl_hugetlb_shm_group))
- return 1;
- return 0;
+ return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
@@ -960,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
if (!hugetlbfs_vfsmount)
return ERR_PTR(-ENOENT);
- if (!can_do_hugetlb_shm(creat_flags)) {
+ if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index 76582b06ab97..4d8e3be55976 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1241,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
}
EXPORT_SYMBOL(generic_delete_inode);
-static void generic_forget_inode(struct inode *inode)
+/**
+ * generic_detach_inode - remove inode from inode lists
+ * @inode: inode to remove
+ *
+ * Remove inode from inode lists, write it if it's dirty. This is just an
+ * internal VFS helper exported for hugetlbfs. Do not use!
+ *
+ * Returns 1 if inode should be completely destroyed.
+ */
+int generic_detach_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -1251,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
inodes_stat.nr_unused++;
if (sb->s_flags & MS_ACTIVE) {
spin_unlock(&inode_lock);
- return;
+ return 0;
}
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_WILL_FREE;
@@ -1269,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
inode->i_state |= I_FREEING;
inodes_stat.nr_inodes--;
spin_unlock(&inode_lock);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(generic_detach_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ if (!generic_detach_inode(inode))
+ return;
if (inode->i_data.nrpages)
truncate_inode_pages(&inode->i_data, 0);
clear_inode(inode);
@@ -1399,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec now;
- if (mnt_want_write(mnt))
- return;
if (inode->i_flags & S_NOATIME)
- goto out;
+ return;
if (IS_NOATIME(inode))
- goto out;
+ return;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- goto out;
+ return;
if (mnt->mnt_flags & MNT_NOATIME)
- goto out;
+ return;
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- goto out;
+ return;
now = current_fs_time(inode->i_sb);
if (!relatime_need_update(mnt, inode, now))
- goto out;
+ return;
if (timespec_equal(&inode->i_atime, &now))
- goto out;
+ return;
+
+ if (mnt_want_write(mnt))
+ return;
inode->i_atime = now;
mark_inode_dirty_sync(inode);
-out:
mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
@@ -1444,34 +1461,37 @@ void file_update_time(struct file *file)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct timespec now;
- int sync_it = 0;
- int err;
+ enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+ /* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return;
- err = mnt_want_write_file(file);
- if (err)
- return;
-
now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_mtime, &now)) {
- inode->i_mtime = now;
- sync_it = 1;
- }
+ if (!timespec_equal(&inode->i_mtime, &now))
+ sync_it = S_MTIME;
- if (!timespec_equal(&inode->i_ctime, &now)) {
- inode->i_ctime = now;
- sync_it = 1;
- }
+ if (!timespec_equal(&inode->i_ctime, &now))
+ sync_it |= S_CTIME;
- if (IS_I_VERSION(inode)) {
- inode_inc_iversion(inode);
- sync_it = 1;
- }
+ if (IS_I_VERSION(inode))
+ sync_it |= S_VERSION;
+
+ if (!sync_it)
+ return;
- if (sync_it)
- mark_inode_dirty_sync(inode);
+ /* Finally allowed to write? Takes lock. */
+ if (mnt_want_write_file(file))
+ return;
+
+ /* Only change inode inside the lock region */
+ if (sync_it & S_VERSION)
+ inode_inc_iversion(inode);
+ if (sync_it & S_CTIME)
+ inode->i_ctime = now;
+ if (sync_it & S_MTIME)
+ inode->i_mtime = now;
+ mark_inode_dirty_sync(inode);
mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(file_update_time);
@@ -1599,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
else if (S_ISSOCK(mode))
inode->i_fop = &bad_sock_fops;
else
- printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
- mode);
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
+ " inode %s:%lu\n", mode, inode->i_sb->s_id,
+ inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);
diff --git a/fs/internal.h b/fs/internal.h
index d55ef562f0bb..515175b8b72e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *);
* namespace.c
*/
extern int copy_mount_options(const void __user *, unsigned long *);
+extern int copy_mount_string(const void __user *, char **);
extern void free_vfsmnt(struct vfsmount *);
extern struct vfsmount *alloc_vfsmnt(const char *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5612880fcbe7..7b17a14396ff 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags);
static int fiemap_check_ranges(struct super_block *sb,
u64 start, u64 len, u64 *new_len)
{
+ u64 maxbytes = (u64) sb->s_maxbytes;
+
*new_len = len;
if (len == 0)
return -EINVAL;
- if (start > sb->s_maxbytes)
+ if (start > maxbytes)
return -EFBIG;
/*
* Shrink request scope to what the fs can actually handle.
*/
- if ((len > sb->s_maxbytes) ||
- (sb->s_maxbytes - len) < start)
- *new_len = sb->s_maxbytes - start;
+ if (len > maxbytes || (maxbytes - len) < start)
+ *new_len = maxbytes - start;
return 0;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 85f96bc651c7..6b4dcd4f2943 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb)
#ifdef CONFIG_JOLIET
lock_kernel();
- if (sbi->s_nls_iocharset) {
- unload_nls(sbi->s_nls_iocharset);
- sbi->s_nls_iocharset = NULL;
- }
+ unload_nls(sbi->s_nls_iocharset);
unlock_kernel();
#endif
@@ -912,8 +909,7 @@ out_no_root:
printk(KERN_WARNING "%s: get root inode failed\n", __func__);
out_no_inode:
#ifdef CONFIG_JOLIET
- if (sbi->s_nls_iocharset)
- unload_nls(sbi->s_nls_iocharset);
+ unload_nls(sbi->s_nls_iocharset);
#endif
goto out_freesbi;
out_no_read:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49b..ca0f5eb62b20 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
+ struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
+ stats = &transaction->t_chp_stats;
+ if (stats->cs_chp_time)
+ stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
+ jiffies);
+ trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
+ transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 26d991ddc1e6..d4cfd6d2779e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (commit_transaction->t_synchronous_commit)
write_op = WRITE_SYNC_PLUG;
trace_jbd2_commit_locking(journal, commit_transaction);
- stats.u.run.rs_wait = commit_transaction->t_max_wait;
- stats.u.run.rs_locked = jiffies;
- stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
- stats.u.run.rs_locked);
+ stats.run.rs_wait = commit_transaction->t_max_wait;
+ stats.run.rs_locked = jiffies;
+ stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
+ stats.run.rs_locked);
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_switch_revoke_table(journal);
trace_jbd2_commit_flushing(journal, commit_transaction);
- stats.u.run.rs_flushing = jiffies;
- stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
- stats.u.run.rs_flushing);
+ stats.run.rs_flushing = jiffies;
+ stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
+ stats.run.rs_flushing);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
spin_unlock(&journal->j_state_lock);
trace_jbd2_commit_logging(journal, commit_transaction);
- stats.u.run.rs_logging = jiffies;
- stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
- stats.u.run.rs_logging);
- stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
- stats.u.run.rs_blocks_logged = 0;
+ stats.run.rs_logging = jiffies;
+ stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
+ stats.run.rs_logging);
+ stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
+ stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
submit_bh(write_op, bh);
}
cond_resched();
- stats.u.run.rs_blocks_logged += bufs;
+ stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
@@ -988,33 +988,30 @@ restart_loop:
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_start = jiffies;
- stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
- commit_transaction->t_start);
+ stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
+ commit_transaction->t_start);
/*
- * File the transaction for history
+ * File the transaction statistics
*/
- stats.ts_type = JBD2_STATS_RUN;
stats.ts_tid = commit_transaction->t_tid;
- stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
- spin_lock(&journal->j_history_lock);
- memcpy(journal->j_history + journal->j_history_cur, &stats,
- sizeof(stats));
- if (++journal->j_history_cur == journal->j_history_max)
- journal->j_history_cur = 0;
+ stats.run.rs_handle_count = commit_transaction->t_handle_count;
+ trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
+ commit_transaction->t_tid, &stats.run);
/*
* Calculate overall stats
*/
+ spin_lock(&journal->j_history_lock);
journal->j_stats.ts_tid++;
- journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
- journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
- journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
- journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
- journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
- journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
- journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
- journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
+ journal->j_stats.run.rs_wait += stats.run.rs_wait;
+ journal->j_stats.run.rs_running += stats.run.rs_running;
+ journal->j_stats.run.rs_locked += stats.run.rs_locked;
+ journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+ journal->j_stats.run.rs_logging += stats.run.rs_logging;
+ journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+ journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+ journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
commit_transaction->t_state = T_FINISHED;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 53b86e16e5fe..b0ab5219becb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
- printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
- "commit interval %ld seconds\n", current->pid,
- journal->j_devname, journal->j_commit_interval / HZ);
-
/*
* And now, wait forever for commit wakeup events.
*/
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
{
struct task_struct *t;
- t = kthread_run(kjournald2, journal, "kjournald2");
+ t = kthread_run(kjournald2, journal, "jbd2/%s",
+ journal->j_devname);
if (IS_ERR(t))
return PTR_ERR(t);
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
int max;
};
-static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
- struct transaction_stats_s *ts,
- int first)
-{
- if (ts == s->stats + s->max)
- ts = s->stats;
- if (!first && ts == s->stats + s->start)
- return NULL;
- while (ts->ts_type == 0) {
- ts++;
- if (ts == s->stats + s->max)
- ts = s->stats;
- if (ts == s->stats + s->start)
- return NULL;
- }
- return ts;
-
-}
-
-static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
- struct jbd2_stats_proc_session *s = seq->private;
- struct transaction_stats_s *ts;
- int l = *pos;
-
- if (l == 0)
- return SEQ_START_TOKEN;
- ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
- if (!ts)
- return NULL;
- l--;
- while (l) {
- ts = jbd2_history_skip_empty(s, ++ts, 0);
- if (!ts)
- break;
- l--;
- }
- return ts;
-}
-
-static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct jbd2_stats_proc_session *s = seq->private;
- struct transaction_stats_s *ts = v;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return jbd2_history_skip_empty(s, s->stats + s->start, 1);
- else
- return jbd2_history_skip_empty(s, ++ts, 0);
-}
-
-static int jbd2_seq_history_show(struct seq_file *seq, void *v)
-{
- struct transaction_stats_s *ts = v;
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
- "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
- "wait", "run", "lock", "flush", "log", "hndls",
- "block", "inlog", "ctime", "write", "drop",
- "close");
- return 0;
- }
- if (ts->ts_type == JBD2_STATS_RUN)
- seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
- "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
- jiffies_to_msecs(ts->u.run.rs_wait),
- jiffies_to_msecs(ts->u.run.rs_running),
- jiffies_to_msecs(ts->u.run.rs_locked),
- jiffies_to_msecs(ts->u.run.rs_flushing),
- jiffies_to_msecs(ts->u.run.rs_logging),
- ts->u.run.rs_handle_count,
- ts->u.run.rs_blocks,
- ts->u.run.rs_blocks_logged);
- else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
- seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
- "C", ts->ts_tid, " ",
- jiffies_to_msecs(ts->u.chp.cs_chp_time),
- ts->u.chp.cs_written, ts->u.chp.cs_dropped,
- ts->u.chp.cs_forced_to_close);
- else
- J_ASSERT(0);
- return 0;
-}
-
-static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations jbd2_seq_history_ops = {
- .start = jbd2_seq_history_start,
- .next = jbd2_seq_history_next,
- .stop = jbd2_seq_history_stop,
- .show = jbd2_seq_history_show,
-};
-
-static int jbd2_seq_history_open(struct inode *inode, struct file *file)
-{
- journal_t *journal = PDE(inode)->data;
- struct jbd2_stats_proc_session *s;
- int rc, size;
-
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL)
- return -ENOMEM;
- size = sizeof(struct transaction_stats_s) * journal->j_history_max;
- s->stats = kmalloc(size, GFP_KERNEL);
- if (s->stats == NULL) {
- kfree(s);
- return -ENOMEM;
- }
- spin_lock(&journal->j_history_lock);
- memcpy(s->stats, journal->j_history, size);
- s->max = journal->j_history_max;
- s->start = journal->j_history_cur % s->max;
- spin_unlock(&journal->j_history_lock);
-
- rc = seq_open(file, &jbd2_seq_history_ops);
- if (rc == 0) {
- struct seq_file *m = file->private_data;
- m->private = s;
- } else {
- kfree(s->stats);
- kfree(s);
- }
- return rc;
-
-}
-
-static int jbd2_seq_history_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = file->private_data;
- struct jbd2_stats_proc_session *s = seq->private;
-
- kfree(s->stats);
- kfree(s);
- return seq_release(inode, file);
-}
-
-static struct file_operations jbd2_seq_history_fops = {
- .owner = THIS_MODULE,
- .open = jbd2_seq_history_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = jbd2_seq_history_release,
-};
-
static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
{
return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
return 0;
- seq_printf(seq, "%lu transaction, each upto %u blocks\n",
+ seq_printf(seq, "%lu transaction, each up to %u blocks\n",
s->stats->ts_tid,
s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
- jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
seq_printf(seq, " %ums running transaction\n",
- jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
- jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
seq_printf(seq, " %ums flushing data (in ordered mode)\n",
- jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
seq_printf(seq, " %ums logging transaction\n",
- jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+ jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
seq_printf(seq, " %lluus average transaction commit time\n",
div_u64(s->journal->j_average_commit_time, 1000));
seq_printf(seq, " %lu handles per transaction\n",
- s->stats->u.run.rs_handle_count / s->stats->ts_tid);
+ s->stats->run.rs_handle_count / s->stats->ts_tid);
seq_printf(seq, " %lu blocks per transaction\n",
- s->stats->u.run.rs_blocks / s->stats->ts_tid);
+ s->stats->run.rs_blocks / s->stats->ts_tid);
seq_printf(seq, " %lu logged blocks per transaction\n",
- s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
+ s->stats->run.rs_blocks_logged / s->stats->ts_tid);
return 0;
}
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static struct file_operations jbd2_seq_info_fops = {
+static const struct file_operations jbd2_seq_info_fops = {
.owner = THIS_MODULE,
.open = jbd2_seq_info_open,
.read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
{
journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
- proc_create_data("history", S_IRUGO, journal->j_proc_entry,
- &jbd2_seq_history_fops, journal);
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
&jbd2_seq_info_fops, journal);
}
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
static void jbd2_stats_proc_exit(journal_t *journal)
{
remove_proc_entry("info", journal->j_proc_entry);
- remove_proc_entry("history", journal->j_proc_entry);
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
-static void journal_init_stats(journal_t *journal)
-{
- int size;
-
- if (!proc_jbd2_stats)
- return;
-
- journal->j_history_max = 100;
- size = sizeof(struct transaction_stats_s) * journal->j_history_max;
- journal->j_history = kzalloc(size, GFP_KERNEL);
- if (!journal->j_history) {
- journal->j_history_max = 0;
- return;
- }
- spin_lock_init(&journal->j_history_lock);
-}
-
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
goto fail;
}
- journal_init_stats(journal);
+ spin_lock_init(&journal->j_history_lock);
return journal;
fail:
@@ -1115,7 +945,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
while ((p = strchr(p, '/')))
*p = '!';
p = journal->j_devname + strlen(journal->j_devname);
- sprintf(p, ":%lu", journal->j_inode->i_ino);
+ sprintf(p, "-%lu", journal->j_inode->i_ino);
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
journal, inode->i_sb->s_id, inode->i_ino,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 37e6dcda8fc8..2234c73fc577 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb)
rc = jfs_umount(sb);
if (rc)
jfs_err("jfs_umount failed with return code %d", rc);
- if (sbi->nls_tab)
- unload_nls(sbi->nls_tab);
- sbi->nls_tab = NULL;
+
+ unload_nls(sbi->nls_tab);
truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
iput(sbi->direct_inode);
- sbi->direct_inode = NULL;
kfree(sbi);
@@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
if (nls_map != (void *) -1) {
/* Discard old (if remount) */
- if (sbi->nls_tab)
- unload_nls(sbi->nls_tab);
+ unload_nls(sbi->nls_tab);
sbi->nls_tab = nls_map;
}
return 1;
diff --git a/fs/libfs.c b/fs/libfs.c
index dcec3d3ea64f..219576c52d80 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
loff_t pos = *ppos;
+ size_t ret;
+
if (pos < 0)
return -EINVAL;
- if (pos >= available)
+ if (pos >= available || !count)
return 0;
if (count > available - pos)
count = available - pos;
- if (copy_to_user(to, from + pos, count))
+ ret = copy_to_user(to, from + pos, count);
+ if (ret == count)
return -EFAULT;
+ count -= ret;
*ppos = pos + count;
return count;
}
@@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
if (copy_from_user(attr->set_buf, buf, size))
goto out;
- ret = len; /* claim we got the whole input */
attr->set_buf[size] = '\0';
val = simple_strtol(attr->set_buf, NULL, 0);
- attr->set(attr->data, val);
+ ret = attr->set(attr->data, val);
+ if (ret == 0)
+ ret = len; /* on success, claim we got the whole input */
out:
mutex_unlock(&attr->mutex);
return ret;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacde..b583ab0a4cbb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
#include <linux/types.h>
#include <linux/sched.h>
-#include <linux/utsname.h>
#include <linux/nfs.h>
#include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d528653192..ad9dbbc9145d 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/sched.h>
-#include <linux/utsname.h>
#include <linux/nfs.h>
#include <linux/sunrpc/xdr.h>
diff --git a/fs/namespace.c b/fs/namespace.c
index 7230787d18b0..bdc3cb4fd222 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
{
struct vfsmount *mnt;
- if (!type || !memchr(type, 0, PAGE_SIZE))
+ if (!type)
return -EINVAL;
/* we need capabilities... */
@@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
return 0;
}
+int copy_mount_string(const void __user *data, char **where)
+{
+ char *tmp;
+
+ if (!data) {
+ *where = NULL;
+ return 0;
+ }
+
+ tmp = strndup_user(data, PAGE_SIZE);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ *where = tmp;
+ return 0;
+}
+
/*
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return -EINVAL;
- if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
- return -EINVAL;
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns);
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
- int retval;
+ int ret;
+ char *kernel_type;
+ char *kernel_dir;
+ char *kernel_dev;
unsigned long data_page;
- unsigned long type_page;
- unsigned long dev_page;
- char *dir_page;
- retval = copy_mount_options(type, &type_page);
- if (retval < 0)
- return retval;
+ ret = copy_mount_string(type, &kernel_type);
+ if (ret < 0)
+ goto out_type;
- dir_page = getname(dir_name);
- retval = PTR_ERR(dir_page);
- if (IS_ERR(dir_page))
- goto out1;
+ kernel_dir = getname(dir_name);
+ if (IS_ERR(kernel_dir)) {
+ ret = PTR_ERR(kernel_dir);
+ goto out_dir;
+ }
- retval = copy_mount_options(dev_name, &dev_page);
- if (retval < 0)
- goto out2;
+ ret = copy_mount_string(dev_name, &kernel_dev);
+ if (ret < 0)
+ goto out_dev;
- retval = copy_mount_options(data, &data_page);
- if (retval < 0)
- goto out3;
+ ret = copy_mount_options(data, &data_page);
+ if (ret < 0)
+ goto out_data;
- retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
- flags, (void *)data_page);
- free_page(data_page);
+ ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
+ (void *) data_page);
-out3:
- free_page(dev_page);
-out2:
- putname(dir_page);
-out1:
- free_page(type_page);
- return retval;
+ free_page(data_page);
+out_data:
+ kfree(kernel_dev);
+out_dev:
+ putname(kernel_dir);
+out_dir:
+ kfree(kernel_type);
+out_type:
+ return ret;
}
/*
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b99ce205b1bd..cf98da1be23e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb)
#ifdef CONFIG_NCPFS_NLS
/* unload the NLS charsets */
- if (server->nls_vol)
- {
- unload_nls(server->nls_vol);
- server->nls_vol = NULL;
- }
- if (server->nls_io)
- {
- unload_nls(server->nls_io);
- server->nls_io = NULL;
- }
+ unload_nls(server->nls_vol);
+ unload_nls(server->nls_io);
#endif /* CONFIG_NCPFS_NLS */
if (server->info_filp)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 53a7ed7eb9c6..0d58caf4a6e1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
oldset_io = server->nls_io;
server->nls_io = iocharset;
- if (oldset_cp)
- unload_nls(oldset_cp);
- if (oldset_io)
- unload_nls(oldset_io);
+ unload_nls(oldset_cp);
+ unload_nls(oldset_io);
return 0;
}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5d8dcb9ee326..15458decdb8a 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -95,7 +95,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
return VM_FAULT_MAJOR;
}
-static struct vm_operations_struct ncp_file_mmap =
+static const struct vm_operations_struct ncp_file_mmap =
{
.fault = ncp_file_mmap_fault,
};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 152025358dad..99ea196f071f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server)
.hostname = clp->cl_hostname,
.address = (struct sockaddr *)&clp->cl_addr,
.addrlen = clp->cl_addrlen,
- .protocol = server->flags & NFS_MOUNT_TCP ?
- IPPROTO_TCP : IPPROTO_UDP,
.nfs_version = clp->rpc_ops->version,
.noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
1 : 0,
@@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server)
if (server->flags & NFS_MOUNT_NONLM)
return 0;
+ switch (clp->cl_proto) {
+ default:
+ nlm_init.protocol = IPPROTO_TCP;
+ break;
+ case XPRT_TRANSPORT_UDP:
+ nlm_init.protocol = IPPROTO_UDP;
+ }
+
host = nlmclnt_init(&nlm_init);
if (IS_ERR(host))
return PTR_ERR(host);
@@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server,
dprintk("--> nfs_init_server()\n");
#ifdef CONFIG_NFS_V3
- if (data->flags & NFS_MOUNT_VER3)
+ if (data->version == 3)
cl_init.rpc_ops = &nfs_v3_clientops;
#endif
@@ -964,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
target->acdirmin = source->acdirmin;
target->acdirmax = source->acdirmax;
target->caps = source->caps;
+ target->options = source->options;
}
/*
@@ -1173,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1, flags & NFS_MOUNT_NORESVPORT);
if (error < 0)
goto error;
- memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
error = nfs_idmap_new(clp);
if (error < 0) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5021b75d2d1e..f5fdd39e037a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
-static struct vm_operations_struct nfs_file_vm_ops;
+static const struct vm_operations_struct nfs_file_vm_ops;
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
.direct_IO = nfs_direct_IO,
.migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
+ .error_remove_page = generic_error_remove_page,
};
/*
@@ -571,7 +572,7 @@ out_unlock:
return VM_FAULT_SIGBUS;
}
-static struct vm_operations_struct nfs_file_vm_ops = {
+static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nfs_vm_page_mkwrite,
};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 379be678cb7e..70fad69eb959 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
/*
* Get the cache cookie for an NFS superblock. We have to handle
* uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
*/
-void nfs_fscache_get_super_cookie(struct super_block *sb,
- struct nfs_parsed_mount_data *data)
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
+ struct nfs_clone_mount *mntdata)
{
struct nfs_fscache_key *key, *xkey;
struct nfs_server *nfss = NFS_SB(sb);
struct rb_node **p, *parent;
- const char *uniq = data->fscache_uniq ?: "";
int diff, ulen;
- ulen = strlen(uniq);
+ if (uniq) {
+ ulen = strlen(uniq);
+ } else if (mntdata) {
+ struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
+ if (mnt_s->fscache_key) {
+ uniq = mnt_s->fscache_key->key.uniquifier;
+ ulen = mnt_s->fscache_key->key.uniq_len;
+ }
+ }
+
+ if (!uniq) {
+ uniq = "";
+ ulen = 1;
+ }
+
key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
if (!key)
return;
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6e809bb0ff08..b9c572d0679f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *);
extern void nfs_fscache_release_client_cookie(struct nfs_client *);
extern void nfs_fscache_get_super_cookie(struct super_block *,
- struct nfs_parsed_mount_data *);
+ const char *,
+ struct nfs_clone_mount *);
extern void nfs_fscache_release_super_cookie(struct super_block *);
extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
static inline void nfs_fscache_get_super_cookie(
struct super_block *sb,
- struct nfs_parsed_mount_data *data)
+ const char *uniq,
+ struct nfs_clone_mount *mntdata)
{
}
static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 060022b4651c..faa091865ad0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -458,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
*/
static int nfs_vmtruncate(struct inode * inode, loff_t offset)
{
- if (i_size_read(inode) < offset) {
- unsigned long limit;
-
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out_big;
- spin_lock(&inode->i_lock);
- i_size_write(inode, offset);
- spin_unlock(&inode->i_lock);
- } else {
- struct address_space *mapping = inode->i_mapping;
+ loff_t oldsize;
+ int err;
- /*
- * truncation of in-use swapfiles is disallowed - it would
- * cause subsequent swapout to scribble on the now-freed
- * blocks.
- */
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
- spin_lock(&inode->i_lock);
- i_size_write(inode, offset);
- spin_unlock(&inode->i_lock);
+ err = inode_newsize_ok(inode, offset);
+ if (err)
+ goto out;
- /*
- * unmap_mapping_range is called twice, first simply for
- * efficiency so that truncate_inode_pages does fewer
- * single-page unmaps. However after this first call, and
- * before truncate_inode_pages finishes, it is possible for
- * private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second
- * unmap_mapping_range call must be made for correctness.
- */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- }
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out_big:
- return -EFBIG;
+ spin_lock(&inode->i_lock);
+ oldsize = inode->i_size;
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+
+ truncate_pagecache(inode, oldsize, offset);
+out:
+ return err;
}
/**
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9a..5e078b222b4e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ee6a13f05443..3f8881d1a050 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
*/
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f1..5fe5492fbd29 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2636c26d56fa..fa3408f20112 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
if (IS_ERR(mnt_path))
- return mnt;
+ return ERR_CAST(mnt_path);
mountdata->mnt_path = mnt_path;
maxbuflen = mnt_path - 1 - page2;
@@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
if (buf->len <= 0 || buf->len >= maxbuflen)
continue;
- mountdata->addr = (struct sockaddr *)&addr;
-
if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
continue;
- mountdata->addrlen = nfs_parse_server_name(buf->data,
- buf->len,
- mountdata->addr, mountdata->addrlen);
+
+ mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
+ (struct sockaddr *)&addr, sizeof(addr));
if (mountdata->addrlen == 0)
continue;
+
+ mountdata->addr = (struct sockaddr *)&addr;
rpc_set_port(mountdata->addr, NFS_PORT);
memcpy(page2, buf->data, buf->len);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index be6544aef41f..ed7c269e2514 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
*/
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/string.h>
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e27c6cef18f2..0156c01c212c 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
}
void
-nfs4_renewd_prepare_shutdown(struct nfs_server *server)
-{
- cancel_delayed_work(&server->nfs_client->cl_renewd);
-}
-
-void
nfs4_kill_renewd(struct nfs_client *clp)
{
cancel_delayed_work_sync(&clp->cl_renewd);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cfc30d362f94..83ad47cbdd8a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49d..ef583854d8d0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/in.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1cc0587cfef..a2c18acb8568 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,6 +728,29 @@ static void nfs_umount_begin(struct super_block *sb)
unlock_kernel();
}
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
+{
+ struct nfs_parsed_mount_data *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data) {
+ data->rsize = NFS_MAX_FILE_IO_SIZE;
+ data->wsize = NFS_MAX_FILE_IO_SIZE;
+ data->acregmin = NFS_DEF_ACREGMIN;
+ data->acregmax = NFS_DEF_ACREGMAX;
+ data->acdirmin = NFS_DEF_ACDIRMIN;
+ data->acdirmax = NFS_DEF_ACDIRMAX;
+ data->mount_server.port = NFS_UNSPEC_PORT;
+ data->nfs_server.port = NFS_UNSPEC_PORT;
+ data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ data->auth_flavors[0] = RPC_AUTH_UNIX;
+ data->auth_flavor_len = 1;
+ data->version = version;
+ data->minorversion = 0;
+ }
+ return data;
+}
+
/*
* Sanity-check a server address provided by the mount command.
*
@@ -755,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr)
* Select between a default port value and a user-specified port value.
* If a zero value is set, then autobind will be used.
*/
-static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
+static void nfs_set_port(struct sockaddr *sap, int *port,
const unsigned short default_port)
{
- unsigned short port = default_port;
-
- if (parsed_port != NFS_UNSPEC_PORT)
- port = parsed_port;
+ if (*port == NFS_UNSPEC_PORT)
+ *port = default_port;
- rpc_set_port(sap, port);
+ rpc_set_port(sap, *port);
}
/*
@@ -1430,10 +1451,13 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
int status;
if (args->mount_server.version == 0) {
- if (args->flags & NFS_MOUNT_VER3)
- args->mount_server.version = NFS_MNT3_VERSION;
- else
- args->mount_server.version = NFS_MNT_VERSION;
+ switch (args->version) {
+ default:
+ args->mount_server.version = NFS_MNT3_VERSION;
+ break;
+ case 2:
+ args->mount_server.version = NFS_MNT_VERSION;
+ }
}
request.version = args->mount_server.version;
@@ -1451,7 +1475,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
args->mount_server.addrlen = args->nfs_server.addrlen;
}
request.salen = args->mount_server.addrlen;
- nfs_set_default_port(request.sap, args->mount_server.port, 0);
+ nfs_set_port(request.sap, &args->mount_server.port, 0);
/*
* Now ask the mount server to map our export path
@@ -1634,20 +1658,6 @@ static int nfs_validate_mount_data(void *options,
if (data == NULL)
goto out_no_data;
- args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
- args->rsize = NFS_MAX_FILE_IO_SIZE;
- args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->acregmin = NFS_DEF_ACREGMIN;
- args->acregmax = NFS_DEF_ACREGMAX;
- args->acdirmin = NFS_DEF_ACDIRMIN;
- args->acdirmax = NFS_DEF_ACDIRMAX;
- args->mount_server.port = NFS_UNSPEC_PORT;
- args->nfs_server.port = NFS_UNSPEC_PORT;
- args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- args->auth_flavor_len = 1;
- args->minorversion = 0;
-
switch (data->version) {
case 1:
data->namlen = 0;
@@ -1755,7 +1765,7 @@ static int nfs_validate_mount_data(void *options,
goto out_v4_not_compiled;
#endif
- nfs_set_default_port(sap, args->nfs_server.port, 0);
+ nfs_set_port(sap, &args->nfs_server.port, 0);
nfs_set_mount_transport_protocol(args);
@@ -1778,7 +1788,7 @@ static int nfs_validate_mount_data(void *options,
}
#ifndef CONFIG_NFS_V3
- if (args->flags & NFS_MOUNT_VER3)
+ if (args->version == 3)
goto out_v3_not_compiled;
#endif /* !CONFIG_NFS_V3 */
@@ -1836,9 +1846,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
data->acdirmin != nfss->acdirmin / HZ ||
data->acdirmax != nfss->acdirmax / HZ ||
data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ data->nfs_server.port != nfss->port ||
data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
- memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
- data->nfs_server.addrlen) != 0)
+ !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
+ (struct sockaddr *)&nfss->nfs_client->cl_addr))
return -EINVAL;
return 0;
@@ -1881,6 +1892,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->acdirmin = nfss->acdirmin / HZ;
data->acdirmax = nfss->acdirmax / HZ;
data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ data->nfs_server.port = nfss->port;
data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
data->nfs_server.addrlen);
@@ -1936,7 +1948,7 @@ static void nfs_fill_super(struct super_block *sb,
if (data->bsize)
sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
- if (server->flags & NFS_MOUNT_VER3) {
+ if (server->nfs_client->rpc_ops->version == 3) {
/* The VFS shouldn't apply the umask to mode bits. We will do
* so ourselves when necessary.
*/
@@ -1960,7 +1972,7 @@ static void nfs_clone_super(struct super_block *sb,
sb->s_blocksize = old_sb->s_blocksize;
sb->s_maxbytes = old_sb->s_maxbytes;
- if (server->flags & NFS_MOUNT_VER3) {
+ if (server->nfs_client->rpc_ops->version == 3) {
/* The VFS shouldn't apply the umask to mode bits. We will do
* so ourselves when necessary.
*/
@@ -2094,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
};
int error = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = nfs_alloc_parsed_mount_data(3);
mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
if (data == NULL || mntfh == NULL)
goto out_free_fh;
@@ -2144,7 +2156,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs_fill_super(s, data);
- nfs_fscache_get_super_cookie(s, data);
+ nfs_fscache_get_super_cookie(
+ s, data ? data->fscache_uniq : NULL, NULL);
}
mntroot = nfs_get_root(s, mntfh);
@@ -2245,6 +2258,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
if (!s->s_root) {
/* initial superblock/root creation */
nfs_clone_super(s, data->sb);
+ nfs_fscache_get_super_cookie(s, NULL, data);
}
mntroot = nfs_get_root(s, data->fh);
@@ -2317,7 +2331,7 @@ static int nfs4_validate_text_mount_data(void *options,
{
struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
- nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
+ nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
nfs_validate_transport_protocol(args);
@@ -2362,18 +2376,6 @@ static int nfs4_validate_mount_data(void *options,
if (data == NULL)
goto out_no_data;
- args->rsize = NFS_MAX_FILE_IO_SIZE;
- args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->acregmin = NFS_DEF_ACREGMIN;
- args->acregmax = NFS_DEF_ACREGMAX;
- args->acdirmin = NFS_DEF_ACDIRMIN;
- args->acdirmax = NFS_DEF_ACDIRMAX;
- args->nfs_server.port = NFS_UNSPEC_PORT;
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- args->auth_flavor_len = 1;
- args->version = 4;
- args->minorversion = 0;
-
switch (data->version) {
case 1:
if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2508,7 +2510,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_fill_super(s);
- nfs_fscache_get_super_cookie(s, data);
+ nfs_fscache_get_super_cookie(
+ s, data ? data->fscache_uniq : NULL, NULL);
}
mntroot = nfs4_get_root(s, mntfh);
@@ -2656,7 +2659,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
struct nfs_parsed_mount_data *data;
int error = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = nfs_alloc_parsed_mount_data(4);
if (data == NULL)
goto out_free_data;
@@ -2686,7 +2689,6 @@ static void nfs4_kill_super(struct super_block *sb)
dprintk("--> %s\n", __func__);
nfs_super_return_all_delegations(sb);
kill_anon_super(sb);
- nfs4_renewd_prepare_shutdown(server);
nfs_fscache_release_super_cookie(sb);
nfs_free_server(server);
dprintk("<-- %s\n", __func__);
@@ -2741,6 +2743,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_clone_super(s, data->sb);
+ nfs_fscache_get_super_cookie(s, NULL, data);
}
mntroot = nfs4_get_root(s, data->fh);
@@ -2822,6 +2825,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_fill_super(s);
+ nfs_fscache_get_super_cookie(s, NULL, data);
}
mntroot = nfs4_get_root(s, &mntfh);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index cdfa86fa1471..ba2c199592fd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
#include <linux/init.h>
#include <linux/mm.h>
-#include <linux/utsname.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 00388d2a3c99..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -176,7 +176,7 @@ static const struct file_operations exports_operations = {
extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
-static struct file_operations pool_stats_operations = {
+static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 6a2711f4c321..5941958f1e47 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
void nilfs_btnode_cache_init_once(struct address_space *btnc)
{
+ memset(btnc, 0, sizeof(*btnc));
INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
spin_lock_init(&btnc->tree_lock);
INIT_LIST_HEAD(&btnc->private_list);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
return 0;
}
-struct file_operations nilfs_dir_operations = {
+const struct file_operations nilfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index fc8278c77cdd..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return 0;
}
-struct vm_operations_struct nilfs_file_vm_ops = {
+static const struct vm_operations_struct nilfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nilfs_page_mkwrite,
};
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* We have mostly NULL's here: the current defaults are ok for
* the nilfs filesystem.
*/
-struct file_operations nilfs_file_operations = {
+const struct file_operations nilfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2d2c501deb54..5040220c3732 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
ii->i_dir_acl = S_ISREG(inode->i_mode) ?
0 : le32_to_cpu(raw_inode->i_dir_acl);
#endif
+ ii->i_dir_start_lookup = 0;
ii->i_cno = 0;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index b18c4998f8d0..f6326112d647 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = {
};
static const struct inode_operations def_mdt_iops;
-static struct file_operations def_mdt_fops;
+static const struct file_operations def_mdt_fops;
/*
* NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index bad7368782d0..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
/*
* Inodes and files operations
*/
-extern struct file_operations nilfs_dir_operations;
+extern const struct file_operations nilfs_dir_operations;
extern const struct inode_operations nilfs_file_inode_operations;
-extern struct file_operations nilfs_file_operations;
+extern const struct file_operations nilfs_file_operations;
extern const struct address_space_operations nilfs_aops;
extern const struct inode_operations nilfs_dir_inode_operations;
extern const struct inode_operations nilfs_special_inode_operations;
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 477d37d83b31..44a88a9fa2c8 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
while (*s && len > 0) {
if (*s & 0x80) {
size = utf8_to_utf32(s, len, &u);
- if (size < 0) {
- /* Ignore character and move on */
- size = 1;
- } else if (u >= PLANE_SIZE) {
+ if (size < 0)
+ return -EINVAL;
+
+ if (u >= PLANE_SIZE) {
u -= PLANE_SIZE;
*op++ = (wchar_t) (SURROGATE_PAIR |
((u >> 10) & SURROGATE_BITS));
@@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset)
void unload_nls(struct nls_table *nls)
{
- module_put(nls->owner);
+ if (nls)
+ module_put(nls->owner);
}
static const wchar_t charset2uni[256] = {
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..cfce53cb65d7 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
.migratepage = buffer_migrate_page, /* Move a page cache page from
one physical page to an
other. */
+ .error_remove_page = generic_error_remove_page,
};
/**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
.migratepage = buffer_migrate_page, /* Move a page cache page from
one physical page to an
other. */
+ .error_remove_page = generic_error_remove_page,
};
#ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index abaaa1cbf8de..80b04770e8e9 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -201,8 +201,7 @@ use_utf8:
v, old_nls->charset);
nls_map = old_nls;
} else /* nls_map */ {
- if (old_nls)
- unload_nls(old_nls);
+ unload_nls(old_nls);
}
} else if (!strcmp(p, "utf8")) {
bool val = false;
@@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb)
ntfs_free(vol->upcase);
vol->upcase = NULL;
}
- if (vol->nls_map) {
- unload_nls(vol->nls_map);
- vol->nls_map = NULL;
- }
+
+ unload_nls(vol->nls_map);
+
sb->s_fs_info = NULL;
kfree(vol);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 72e76062a900..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2022,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
}
#endif /* CONFIG_DEBUG_FS */
-static struct file_operations o2hb_debug_fops = {
+static const struct file_operations o2hb_debug_fops = {
.open = o2hb_debug_open,
.release = o2hb_debug_release,
.read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index cfb2be708abe..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations nst_seq_fops = {
+static const struct file_operations nst_seq_fops = {
.open = nst_fop_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations sc_seq_fops = {
+static const struct file_operations sc_seq_fops = {
.open = sc_fop_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e58322..01cf8cc3d286 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf3..ca96bce50e18 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c5c88124096d..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
@@ -479,7 +478,7 @@ bail:
return -ENOMEM;
}
-static struct file_operations debug_purgelist_fops = {
+static const struct file_operations debug_purgelist_fops = {
.open = debug_purgelist_open,
.release = debug_buffer_release,
.read = debug_buffer_read,
@@ -539,7 +538,7 @@ bail:
return -ENOMEM;
}
-static struct file_operations debug_mle_fops = {
+static const struct file_operations debug_mle_fops = {
.open = debug_mle_open,
.release = debug_buffer_release,
.read = debug_buffer_read,
@@ -742,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations debug_lockres_fops = {
+static const struct file_operations debug_lockres_fops = {
.open = debug_lockres_open,
.release = debug_lockres_release,
.read = seq_read,
@@ -926,7 +925,7 @@ bail:
return -ENOMEM;
}
-static struct file_operations debug_state_fops = {
+static const struct file_operations debug_state_fops = {
.open = debug_state_open,
.release = debug_buffer_release,
.read = debug_buffer_read,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd8..0334000676d3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac8..437698e9465f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4dd..83bcaf266b35 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e3280569..d9fa3d22e17c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 98569e86c613..52ec020ea78b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e0..00f53b2aea76 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b606496b72ec..39737613424a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -202,7 +202,7 @@ out:
return ret;
}
-static struct vm_operations_struct ocfs2_file_vm_ops = {
+static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
};
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 24feb449a1dc..c0e48aeebb1c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/statfs.h>
@@ -374,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
}
#endif /* CONFIG_DEBUG_FS */
-static struct file_operations ocfs2_osb_debug_fops = {
+static const struct file_operations ocfs2_osb_debug_fops = {
.open = ocfs2_osb_debug_open,
.release = ocfs2_debug_release,
.read = ocfs2_debug_read,
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110f..e3421030a69f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
-#include <linux/utsname.h>
#include <linux/namei.h>
#define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3680bae335b5..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = {
.rmdir = omfs_rmdir,
};
-struct file_operations omfs_dir_operations = {
+const struct file_operations omfs_dir_operations = {
.read = generic_read_dir,
.readdir = omfs_readdir,
.llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 4845fbb18e6e..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, omfs_get_block);
}
-struct file_operations omfs_file_operations = {
+const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index df71039945ac..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
/* dir.c */
-extern struct file_operations omfs_dir_operations;
+extern const struct file_operations omfs_dir_operations;
extern const struct inode_operations omfs_dir_inops;
extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
u64 fsblock);
/* file.c */
-extern struct file_operations omfs_file_operations;
+extern const struct file_operations omfs_file_operations;
extern const struct inode_operations omfs_file_inops;
extern const struct address_space_operations omfs_aops;
extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0c6bc602e6c4..07f77a7945c3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -322,6 +322,8 @@ static inline void task_context_switch_counts(struct seq_file *m,
p->nivcsw);
}
+#ifdef CONFIG_MMU
+
struct stack_stats {
struct vm_area_struct *vma;
unsigned long startpage;
@@ -402,6 +404,11 @@ static inline void task_show_stack_usage(struct seq_file *m,
mmput(mm);
}
}
+#else
+static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
+{
+}
+#endif /* CONFIG_MMU */
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 56013371f9f3..a44a7897fd4d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,7 +23,6 @@
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
-#include <linux/mm.h>
#include <linux/memory.h>
#include <asm/sections.h>
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 171e052c07b3..c7bff4f603ff 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -97,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"Committed_AS: %8lu kB\n"
"VmallocTotal: %8lu kB\n"
"VmallocUsed: %8lu kB\n"
- "VmallocChunk: %8lu kB\n",
+ "VmallocChunk: %8lu kB\n"
+#ifdef CONFIG_MEMORY_FAILURE
+ "HardwareCorrupted: %8lu kB\n"
+#endif
+ ,
K(i.totalram),
K(i.freeram),
K(i.bufferram),
@@ -144,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
vmi.largest_chunk >> 10
+#ifdef CONFIG_MEMORY_FAILURE
+ ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+#endif
);
hugetlb_report_meminfo(m);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2281c2cbfe2b..5033ce0d254b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = {
#define KPF_COMPOUND_TAIL 16
#define KPF_HUGE 17
#define KPF_UNEVICTABLE 18
+#define KPF_HWPOISON 19
#define KPF_NOPAGE 20
#define KPF_KSM 21
@@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page)
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
+#ifdef CONFIG_MEMORY_FAILURE
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+#endif
+
#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
#endif
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
/* careful: calling conventions are nasty here */
res = count;
- error = table->proc_handler(table, write, filp, buf, &res, ppos);
+ error = table->proc_handler(table, write, buf, &res, ppos);
if (!error)
error = res;
out:
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..766b1d456050 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -4,13 +4,18 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/kernel_stat.h>
#include <asm/cputime.h>
static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec uptime;
struct timespec idle;
- cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
+ int i;
+ cputime_t idletime = cputime_zero;
+
+ for_each_possible_cpu(i)
+ idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
do_posix_clock_monotonic_gettime(&uptime);
monotonic_to_bootbased(&uptime);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 11f0c06316de..32fae4040ebf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
if (unlikely(order >= MAX_ORDER))
- goto too_big;
+ return -EFBIG;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && newsize > limit)
- goto fsize_exceeded;
-
- if (newsize > inode->i_sb->s_maxbytes)
- goto too_big;
+ ret = inode_newsize_ok(inode, newsize);
+ if (ret)
+ return ret;
i_size_write(inode, newsize);
@@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
return 0;
- fsize_exceeded:
- send_sig(SIGXFSZ, current, 0);
- too_big:
- return -EFBIG;
-
- add_error:
+add_error:
while (loop < npages)
__free_page(pages + loop++);
return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 6c8c55dec2bc..3ac28987f22a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
pos = *ppos;
- retval = -EINVAL;
- if (unlikely(pos < 0))
- goto fput_out;
if (unlikely(pos + count > max)) {
retval = -EOVERFLOW;
if (pos >= max)
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index b3208adf8e71..71e2b4d50a0a 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd)
- return romfs_mtd_strnlen(sb, pos, limit);
+ return romfs_mtd_strnlen(sb, pos, maxlen);
#endif
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev)
- return romfs_blk_strnlen(sb, pos, limit);
+ return romfs_blk_strnlen(sb, pos, maxlen);
#endif
return -EIO;
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 47f132df0c3f..c117fa80d1e9 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
root = romfs_iget(sb, pos);
- if (!root)
+ if (IS_ERR(root))
goto error;
sb->s_root = d_alloc_root(root);
diff --git a/fs/select.c b/fs/select.c
index a201fc370223..fd38ce2e32e3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/module.h>
#include <linux/slab.h>
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6c959275f2d0..eae7d9dbf3ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
*/
int seq_path(struct seq_file *m, struct path *path, char *esc)
{
- if (m->count < m->size) {
- char *s = m->buf + m->count;
- char *p = d_path(path, s, m->size - m->count);
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int res = -1;
+
+ if (size) {
+ char *p = d_path(path, buf, size);
if (!IS_ERR(p)) {
- s = mangle_path(s, p, esc);
- if (s) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return s - p;
- }
+ char *end = mangle_path(buf, p, esc);
+ if (end)
+ res = end - buf;
}
}
- m->count = m->size;
- return -1;
+ seq_commit(m, res);
+
+ return res;
}
EXPORT_SYMBOL(seq_path);
@@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path);
int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
char *esc)
{
- int err = -ENAMETOOLONG;
- if (m->count < m->size) {
- char *s = m->buf + m->count;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int res = -ENAMETOOLONG;
+
+ if (size) {
char *p;
spin_lock(&dcache_lock);
- p = __d_path(path, root, s, m->size - m->count);
+ p = __d_path(path, root, buf, size);
spin_unlock(&dcache_lock);
- err = PTR_ERR(p);
+ res = PTR_ERR(p);
if (!IS_ERR(p)) {
- s = mangle_path(s, p, esc);
- if (s) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return 0;
- }
+ char *end = mangle_path(buf, p, esc);
+ if (end)
+ res = end - buf;
+ else
+ res = -ENAMETOOLONG;
}
}
- m->count = m->size;
- return err;
+ seq_commit(m, res);
+
+ return res < 0 ? res : 0;
}
/*
@@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
*/
int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
{
- if (m->count < m->size) {
- char *s = m->buf + m->count;
- char *p = dentry_path(dentry, s, m->size - m->count);
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int res = -1;
+
+ if (size) {
+ char *p = dentry_path(dentry, buf, size);
if (!IS_ERR(p)) {
- s = mangle_path(s, p, esc);
- if (s) {
- p = m->buf + m->count;
- m->count = s - m->buf;
- return s - p;
- }
+ char *end = mangle_path(buf, p, esc);
+ if (end)
+ res = end - buf;
}
}
- m->count = m->size;
- return -1;
+ seq_commit(m, res);
+
+ return res;
}
int seq_bitmap(struct seq_file *m, const unsigned long *bits,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1402d2d54f52..1c4c8f089970 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
static void
smb_unload_nls(struct smb_sb_info *server)
{
- if (server->remote_nls) {
- unload_nls(server->remote_nls);
- server->remote_nls = NULL;
- }
- if (server->local_nls) {
- unload_nls(server->local_nls);
- server->local_nls = NULL;
- }
+ unload_nls(server->remote_nls);
+ unload_nls(server->local_nls);
}
static void
diff --git a/fs/super.c b/fs/super.c
index 0e7207b9815c..19eb70b374bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -465,6 +465,48 @@ rescan:
}
EXPORT_SYMBOL(get_super);
+
+/**
+ * get_active_super - get an active reference to the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given. Returns the superblock with an active
+ * reference and s_umount held exclusively or %NULL if none was found.
+ */
+struct super_block *get_active_super(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ if (!bdev)
+ return NULL;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdev != bdev)
+ continue;
+
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_write(&sb->s_umount);
+ if (sb->s_root) {
+ spin_lock(&sb_lock);
+ if (sb->s_count > S_BIAS) {
+ atomic_inc(&sb->s_active);
+ sb->s_count--;
+ spin_unlock(&sb_lock);
+ return sb;
+ }
+ spin_unlock(&sb_lock);
+ }
+ up_write(&sb->s_umount);
+ put_super(sb);
+ yield();
+ spin_lock(&sb_lock);
+ }
+ spin_unlock(&sb_lock);
+ return NULL;
+}
struct super_block * user_get_super(dev_t dev)
{
@@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
int retval;
int remount_rw;
-
+
+ if (sb->s_frozen != SB_UNFROZEN)
+ return -EBUSY;
+
#ifdef CONFIG_BLOCK
if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
+
if (flags & MS_RDONLY)
acct_auto_close(sb);
shrink_dcache_sb(sb);
@@ -743,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
* will protect the lockfs code from trying to start a snapshot
* while we are mounting
*/
- down(&bdev->bd_mount_sem);
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ error = -EBUSY;
+ goto error_bdev;
+ }
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
- up(&bdev->bd_mount_sem);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (IS_ERR(s))
goto error_s;
@@ -892,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
if (error)
goto out_sb;
+ /*
+ * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
+ * but s_maxbytes was an unsigned long long for many releases. Throw
+ * this warning for a little while to try and catch filesystems that
+ * violate this rule. This warning should be either removed or
+ * converted to a BUG() in 2.6.34.
+ */
+ WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+ "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 2524714bece1..60c702bc10ae 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -40,7 +40,7 @@ struct bin_buffer {
struct mutex mutex;
void *buffer;
int mmapped;
- struct vm_operations_struct *vm_ops;
+ const struct vm_operations_struct *vm_ops;
struct file *file;
struct hlist_node list;
};
@@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
}
#endif
-static struct vm_operations_struct bin_vm_ops = {
+static const struct vm_operations_struct bin_vm_ops = {
.open = bin_vma_open,
.close = bin_vma_close,
.fault = bin_fault,
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0050fc40e8c9..5fad489ce5bc 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -894,7 +894,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
mutex_lock(&sysfs_rename_mutex);
BUG_ON(!sd->s_parent);
- new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
+ new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
+ new_parent_kobj->sd : &sysfs_root;
error = 0;
if (sd->s_parent == new_parent_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 561a9c050cef..f5ea4680f15f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
struct sysfs_open_dirent *od, *new_od = NULL;
retry:
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irq(&sysfs_open_dirent_lock);
if (!sd->s_attr.open && new_od) {
sd->s_attr.open = new_od;
@@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
list_add_tail(&buffer->list, &od->buffers);
}
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irq(&sysfs_open_dirent_lock);
if (od) {
kfree(new_od);
@@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
struct sysfs_buffer *buffer)
{
struct sysfs_open_dirent *od = sd->s_attr.open;
+ unsigned long flags;
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
list_del(&buffer->list);
if (atomic_dec_and_test(&od->refcnt))
@@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
else
od = NULL;
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
kfree(od);
}
@@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
void sysfs_notify_dirent(struct sysfs_dirent *sd)
{
struct sysfs_open_dirent *od;
+ unsigned long flags;
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
od = sd->s_attr.open;
if (od) {
@@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
wake_up_interruptible(&od->poll);
}
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
}
EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2e6481a7701c..1009adc8d602 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1534,7 +1534,7 @@ out_unlock:
return err;
}
-static struct vm_operations_struct ubifs_file_vm_ops = {
+static const struct vm_operations_struct ubifs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ubifs_vm_page_mkwrite,
};
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31db..c2e30eea74dc 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -186,19 +186,37 @@ xfs_destroy_ioend(
}
/*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+ xfs_ioend_t *ioend)
+{
+ xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ xfs_fsize_t isize;
+ xfs_fsize_t bsize;
+
+ bsize = ioend->io_offset + ioend->io_size;
+ isize = MAX(ip->i_size, ip->i_new_size);
+ isize = MIN(isize, bsize);
+ return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
* Update on-disk file size now that data has been written to disk.
* The current in-memory file size is i_size. If a write is beyond
* eof i_new_size will be the intended file size until i_size is
* updated. If this write does not extend all the way to the valid
* file size then restrict this update to the end of the write.
*/
+
STATIC void
xfs_setfilesize(
xfs_ioend_t *ioend)
{
xfs_inode_t *ip = XFS_I(ioend->io_inode);
xfs_fsize_t isize;
- xfs_fsize_t bsize;
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
ASSERT(ioend->io_type != IOMAP_READ);
@@ -206,16 +224,10 @@ xfs_setfilesize(
if (unlikely(ioend->io_error))
return;
- bsize = ioend->io_offset + ioend->io_size;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
-
- if (ip->i_d.di_size < isize) {
+ isize = xfs_ioend_new_eof(ioend);
+ if (isize) {
ip->i_d.di_size = isize;
- ip->i_update_core = 1;
xfs_mark_inode_dirty_sync(ip);
}
@@ -404,10 +416,16 @@ xfs_submit_ioend_bio(
struct bio *bio)
{
atomic_inc(&ioend->io_remaining);
-
bio->bi_private = ioend;
bio->bi_end_io = xfs_end_bio;
+ /*
+ * If the I/O is beyond EOF we mark the inode dirty immediately
+ * but don't update the inode size until I/O completion.
+ */
+ if (xfs_ioend_new_eof(ioend))
+ xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+
submit_bio(WRITE, bio);
ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
bio_put(bio);
@@ -1635,4 +1653,5 @@ const struct address_space_operations xfs_address_space_operations = {
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 988d8f87bc0f..eff61e2732af 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -42,7 +42,7 @@
#include <linux/dcache.h>
-static struct vm_operations_struct xfs_file_vm_ops;
+static const struct vm_operations_struct xfs_file_vm_ops;
STATIC ssize_t
xfs_file_aio_read(
@@ -176,14 +176,7 @@ xfs_file_fsync(
struct dentry *dentry,
int datasync)
{
- struct inode *inode = dentry->d_inode;
- struct xfs_inode *ip = XFS_I(inode);
- int error;
-
- /* capture size updates in I/O completion before writing the inode. */
- error = filemap_fdatawait(inode->i_mapping);
- if (error)
- return error;
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
xfs_iflags_clear(ip, XFS_ITRUNCATED);
return -xfs_fsync(ip);
@@ -280,7 +273,7 @@ const struct file_operations xfs_dir_file_operations = {
.fsync = xfs_file_fsync,
};
-static struct vm_operations_struct xfs_file_vm_ops = {
+static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = xfs_vm_page_mkwrite,
};
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da0159d99f82..cd42ef78f6b5 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -57,19 +57,22 @@
#include <linux/fiemap.h>
/*
- * Bring the atime in the XFS inode uptodate.
- * Used before logging the inode to disk or when the Linux inode goes away.
+ * Bring the timestamps in the XFS inode uptodate.
+ *
+ * Used before writing the inode to disk.
*/
void
-xfs_synchronize_atime(
+xfs_synchronize_times(
xfs_inode_t *ip)
{
struct inode *inode = VFS_I(ip);
- if (!(inode->i_state & I_CLEAR)) {
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- }
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+ ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+ ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
}
/*
@@ -106,32 +109,20 @@ xfs_ichgtime(
if ((flags & XFS_ICHGTIME_MOD) &&
!timespec_equal(&inode->i_mtime, &tv)) {
inode->i_mtime = tv;
- ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
sync_it = 1;
}
if ((flags & XFS_ICHGTIME_CHG) &&
!timespec_equal(&inode->i_ctime, &tv)) {
inode->i_ctime = tv;
- ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
sync_it = 1;
}
/*
- * We update the i_update_core field _after_ changing
- * the timestamps in order to coordinate properly with
- * xfs_iflush() so that we don't lose timestamp updates.
- * This keeps us from having to hold the inode lock
- * while doing this. We use the SYNCHRONIZE macro to
- * ensure that the compiler does not reorder the update
- * of i_update_core above the timestamp updates above.
+ * Update complete - now make sure everyone knows that the inode
+ * is dirty.
*/
- if (sync_it) {
- SYNCHRONIZE();
- ip->i_update_core = 1;
+ if (sync_it)
xfs_mark_inode_dirty_sync(ip);
- }
}
/*
@@ -506,10 +497,8 @@ xfs_vn_getattr(
stat->gid = ip->i_d.di_gid;
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
- stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
stat->blocks =
XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 49e4a6aea73c..072050f8d346 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -667,7 +667,7 @@ start:
xip->i_new_size = new_size;
if (likely(!(ioflags & IO_INVIS)))
- xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ file_update_time(file);
/*
* If the offset is beyond the size of the file, we have a couple
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bdd41c8c342f..18a4b8e11df2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -977,6 +977,28 @@ xfs_fs_inode_init_once(
}
/*
+ * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
+ * we catch unlogged VFS level updates to the inode. Care must be taken
+ * here - the transaction code calls mark_inode_dirty_sync() to mark the
+ * VFS inode dirty in a transaction and clears the i_update_core field;
+ * it must clear the field after calling mark_inode_dirty_sync() to
+ * correctly indicate that the dirty state has been propagated into the
+ * inode log item.
+ *
+ * We need the barrier() to maintain correct ordering between unlogged
+ * updates and the transaction commit code that clears the i_update_core
+ * field. This requires all updates to be completed before marking the
+ * inode dirty.
+ */
+STATIC void
+xfs_fs_dirty_inode(
+ struct inode *inode)
+{
+ barrier();
+ XFS_I(inode)->i_update_core = 1;
+}
+
+/*
* Attempt to flush the inode, this will actually fail
* if the inode is pinned, but we dirty the inode again
* at the point when it is unpinned after a log write,
@@ -1126,7 +1148,7 @@ xfs_fs_put_super(
}
STATIC int
-xfs_fs_sync_super(
+xfs_fs_sync_fs(
struct super_block *sb,
int wait)
{
@@ -1134,23 +1156,23 @@ xfs_fs_sync_super(
int error;
/*
- * Treat a sync operation like a freeze. This is to work
- * around a race in sync_inodes() which works in two phases
- * - an asynchronous flush, which can write out an inode
- * without waiting for file size updates to complete, and a
- * synchronous flush, which wont do anything because the
- * async flush removed the inode's dirty flag. Also
- * sync_inodes() will not see any files that just have
- * outstanding transactions to be flushed because we don't
- * dirty the Linux inode until after the transaction I/O
- * completes.
+ * Not much we can do for the first async pass. Writing out the
+ * superblock would be counter-productive as we are going to redirty
+ * when writing out other data and metadata (and writing out a single
+ * block is quite fast anyway).
+ *
+ * Try to asynchronously kick off quota syncing at least.
*/
- if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
- error = xfs_quiesce_data(mp);
- else
- error = xfs_sync_fsdata(mp, 0);
+ if (!wait) {
+ xfs_qm_sync(mp, SYNC_TRYLOCK);
+ return 0;
+ }
+
+ error = xfs_quiesce_data(mp);
+ if (error)
+ return -error;
- if (unlikely(laptop_mode)) {
+ if (laptop_mode) {
int prev_sync_seq = mp->m_sync_seq;
/*
@@ -1169,7 +1191,7 @@ xfs_fs_sync_super(
mp->m_sync_seq != prev_sync_seq);
}
- return -error;
+ return 0;
}
STATIC int
@@ -1539,10 +1561,11 @@ xfs_fs_get_sb(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
.write_inode = xfs_fs_write_inode,
.clear_inode = xfs_fs_clear_inode,
.put_super = xfs_fs_put_super,
- .sync_fs = xfs_fs_sync_super,
+ .sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
.statfs = xfs_fs_statfs,
.remount_fs = xfs_fs_remount,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 320be6aea492..961df0a22c78 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -309,11 +309,15 @@ xfs_sync_attr(
STATIC int
xfs_commit_dummy_trans(
struct xfs_mount *mp,
- uint log_flags)
+ uint flags)
{
struct xfs_inode *ip = mp->m_rootip;
struct xfs_trans *tp;
int error;
+ int log_flags = XFS_LOG_FORCE;
+
+ if (flags & SYNC_WAIT)
+ log_flags |= XFS_LOG_SYNC;
/*
* Put a dummy transaction in the log to tell recovery
@@ -331,13 +335,12 @@ xfs_commit_dummy_trans(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /* XXX(hch): ignoring the error here.. */
error = xfs_trans_commit(tp, 0);
-
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* the log force ensures this transaction is pushed to disk */
xfs_log_force(mp, 0, log_flags);
- return 0;
+ return error;
}
int
@@ -385,7 +388,20 @@ xfs_sync_fsdata(
else
XFS_BUF_ASYNC(bp);
- return xfs_bwrite(mp, bp);
+ error = xfs_bwrite(mp, bp);
+ if (error)
+ return error;
+
+ /*
+ * If this is a data integrity sync make sure all pending buffers
+ * are flushed out for the log coverage check below.
+ */
+ if (flags & SYNC_WAIT)
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+ if (xfs_log_need_covered(mp))
+ error = xfs_commit_dummy_trans(mp, flags);
+ return error;
out_brelse:
xfs_buf_relse(bp);
@@ -419,14 +435,16 @@ xfs_quiesce_data(
/* push non-blocking */
xfs_sync_data(mp, 0);
xfs_qm_sync(mp, SYNC_TRYLOCK);
- xfs_filestream_flush(mp);
- /* push and block */
+ /* push and block till complete */
xfs_sync_data(mp, SYNC_WAIT);
xfs_qm_sync(mp, SYNC_WAIT);
+ /* drop inode references pinned by filestreams */
+ xfs_filestream_flush(mp);
+
/* write superblock and hoover up shutdown errors */
- error = xfs_sync_fsdata(mp, 0);
+ error = xfs_sync_fsdata(mp, SYNC_WAIT);
/* flush data-only devices */
if (mp->m_rtdev_targp)
@@ -570,8 +588,6 @@ xfs_sync_worker(
/* dgc: errors ignored here */
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
- if (xfs_log_need_covered(mp))
- error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
xfs_stats_clear_proc_handler(
ctl_table *ctl,
int write,
- struct file *filp,
void __user *buffer,
size_t *lenp,
loff_t *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
int c, ret, *valp = ctl->data;
__uint32_t vn_active;
- ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
printk("XFS Clearing xfsstats\n");
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7465f9ee125f..ab89a7e94a0f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -206,10 +206,10 @@ xfs_swap_extents(
* process that the file was not changed out from
* under it.
*/
- if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
- (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
- (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
- (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
error = XFS_ERROR(EBUSY);
goto out_unlock;
}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index fa913e459442..41ad537c49e9 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents(
*/
ra_want = howmany(bufsize + mp->m_dirblksize,
mp->m_sb.sb_blocksize) - 1;
+ ASSERT(ra_want >= 0);
/*
* If we don't have as many as we want, and we haven't
@@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents(
*/
ptr += length;
curoff += length;
- bufsize -= length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c1dc7ef5a1d8..b92a4fa2a0a1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3068,9 +3068,9 @@ xfs_iflush_int(
SYNCHRONIZE();
/*
- * Make sure to get the latest atime from the Linux inode.
+ * Make sure to get the latest timestamps from the Linux inode.
*/
- xfs_synchronize_atime(ip);
+ xfs_synchronize_times(ip);
if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0b38b9a869ec..41555de1d1db 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
-void xfs_synchronize_atime(xfs_inode_t *);
+void xfs_synchronize_times(xfs_inode_t *);
void xfs_mark_inode_dirty_sync(xfs_inode_t *);
#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 47d5b663c37e..9794b876d6ff 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -232,6 +232,15 @@ xfs_inode_item_format(
nvecs = 1;
/*
+ * Make sure the linux inode is dirty. We do this before
+ * clearing i_update_core as the VFS will call back into
+ * XFS here and set i_update_core, so we need to dirty the
+ * inode first so that the ordering of i_update_core and
+ * unlogged modifications still works as described below.
+ */
+ xfs_mark_inode_dirty_sync(ip);
+
+ /*
* Clear i_update_core if the timestamps (or any other
* non-transactional modification) need flushing/logging
* and we're about to log them with the rest of the core.
@@ -263,14 +272,9 @@ xfs_inode_item_format(
}
/*
- * Make sure to get the latest atime from the Linux inode.
+ * Make sure to get the latest timestamps from the Linux inode.
*/
- xfs_synchronize_atime(ip);
-
- /*
- * make sure the linux inode is dirty
- */
- xfs_mark_inode_dirty_sync(ip);
+ xfs_synchronize_times(ip);
vecp->i_addr = (xfs_caddr_t)&ip->i_d;
vecp->i_len = sizeof(struct xfs_icdinode);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b68f9107e26c..62efab2f3839 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,6 +59,7 @@ xfs_bulkstat_one_iget(
{
xfs_icdinode_t *dic; /* dinode core info pointer */
xfs_inode_t *ip; /* incore inode pointer */
+ struct inode *inode;
int error;
error = xfs_iget(mp, NULL, ino,
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
ASSERT(ip->i_imap.im_blkno != 0);
dic = &ip->i_d;
+ inode = VFS_I(ip);
/* xfs_iget returns the following without needing
* further change.
@@ -83,16 +85,19 @@ xfs_bulkstat_one_iget(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
+
/*
- * We are reading the atime from the Linux inode because the
- * dinode might not be uptodate.
+ * We need to read the timestamps from the Linux inode because
+ * the VFS keeps writing directly into the inode structure instead
+ * of telling us about the updates.
*/
- buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
- buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
- buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
- buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+ buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+ buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+ buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a434f287962d..b572f7e840e0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2476,12 +2476,6 @@ xfs_reclaim(
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
/*
- * Make sure the atime in the XFS inode is correct before freeing the
- * Linux inode.
- */
- xfs_synchronize_atime(ip);
-
- /*
* If we have nothing to flush with this inode then complete the
* teardown now, otherwise break the link between the xfs inode and the
* linux inode and clean up the xfs inode later. This avoids flushing