summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bfs/bfs.h11
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/bfs/file.c2
-rw-r--r--fs/bfs/inode.c65
-rw-r--r--fs/binfmt_script.c10
-rw-r--r--fs/buffer.c50
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/eventpoll.c218
-rw-r--r--fs/exec.c103
-rw-r--r--fs/file_table.c7
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/hfsplus/dir.c1
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c21
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/nfscache.c2
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ocfs2/Makefile2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c17
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c3
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/ocfs2/move_extents.c47
-rw-r--r--fs/proc/base.c14
-rw-r--r--fs/proc/task_mmu.c14
-rw-r--r--fs/proc/util.c1
-rw-r--r--fs/userfaultfd.c11
28 files changed, 368 insertions, 252 deletions
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 67aef3bb89e4..606f9378b2f0 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -1,13 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* fs/bfs/bfs.h
- * Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
*/
#ifndef _FS_BFS_BFS_H
#define _FS_BFS_BFS_H
#include <linux/bfs_fs.h>
+/* In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to 513 inclusive.
+ In actual fact, attempting to create the 512th inode (i.e. inode No. 513 or file No. 511)
+ will fail with ENOSPC in bfs_add_entry(): the root directory cannot contain so many entries, counting '..'.
+ So, mkfs.bfs(8) should really limit its -N option to 511 and not 512. For now, we just print a warning
+ if a filesystem is mounted with such "impossible to fill up" number of inodes */
+#define BFS_MAX_LASTI 513
+
/*
* BFS file system in-core superblock info
*/
@@ -17,7 +24,7 @@ struct bfs_sb_info {
unsigned long si_freei;
unsigned long si_lf_eblk;
unsigned long si_lasti;
- unsigned long *si_imap;
+ DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1);
struct mutex bfs_lock;
};
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index f32f21c3bbc7..d8dfe3a0cb39 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,8 +2,8 @@
/*
* fs/bfs/dir.c
* BFS directory operations.
- * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
- * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
*/
#include <linux/time.h>
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 1476cdd90cfb..0dceefc54b48 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,7 +2,7 @@
/*
* fs/bfs/file.c
* BFS file operations.
- * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
*
* Make the file block allocation algorithm understand the size
* of the underlying block device.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f7ef2913bd9d..ea2055d78858 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,10 +1,9 @@
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
- * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
- *
- * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
+ * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
*/
#include <linux/module.h>
@@ -118,12 +117,12 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
unsigned int ino = (u16)inode->i_ino;
- unsigned long i_sblock;
+ unsigned long i_sblock;
struct bfs_inode *di;
struct buffer_head *bh;
int err = 0;
- dprintf("ino=%08x\n", ino);
+ dprintf("ino=%08x\n", ino);
di = find_inode(inode->i_sb, ino, &bh);
if (IS_ERR(di))
@@ -144,7 +143,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- i_sblock = BFS_I(inode)->i_sblock;
+ i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
@@ -188,13 +187,13 @@ static void bfs_evict_inode(struct inode *inode)
mark_buffer_dirty(bh);
brelse(bh);
- if (bi->i_dsk_ino) {
+ if (bi->i_dsk_ino) {
if (bi->i_sblock)
info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
info->si_freei++;
clear_bit(ino, info->si_imap);
- bfs_dump_imap("delete_inode", s);
- }
+ bfs_dump_imap("evict_inode", s);
+ }
/*
* If this was the last file, make the previous block
@@ -214,7 +213,6 @@ static void bfs_put_super(struct super_block *s)
return;
mutex_destroy(&info->bfs_lock);
- kfree(info->si_imap);
kfree(info);
s->s_fs_info = NULL;
}
@@ -311,8 +309,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
else
strcat(tmpbuf, "0");
}
- printf("BFS-fs: %s: lasti=%08lx <%s>\n",
- prefix, BFS_SB(s)->si_lasti, tmpbuf);
+ printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
free_page((unsigned long)tmpbuf);
#endif
}
@@ -323,7 +320,7 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
struct buffer_head *bh, *sbh;
struct bfs_super_block *bfs_sb;
struct inode *inode;
- unsigned i, imap_len;
+ unsigned i;
struct bfs_sb_info *info;
int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
@@ -342,8 +339,7 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
bfs_sb = (struct bfs_super_block *)sbh->b_data;
if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
if (!silent)
- printf("No BFS filesystem on %s (magic=%08x)\n",
- s->s_id, le32_to_cpu(bfs_sb->s_magic));
+ printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic));
goto out1;
}
if (BFS_UNCLEAN(bfs_sb, s) && !silent)
@@ -352,18 +348,16 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
s->s_magic = BFS_MAGIC;
if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
- le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) {
- printf("Superblock is corrupted\n");
+ le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) {
+ printf("Superblock is corrupted on %s\n", s->s_id);
goto out1;
}
- info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
- sizeof(struct bfs_inode)
- + BFS_ROOT_INO - 1;
- imap_len = (info->si_lasti / 8) + 1;
- info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN);
- if (!info->si_imap) {
- printf("Cannot allocate %u bytes\n", imap_len);
+ info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
+ if (info->si_lasti == BFS_MAX_LASTI)
+ printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+ else if (info->si_lasti > BFS_MAX_LASTI) {
+ printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
goto out1;
}
for (i = 0; i < BFS_ROOT_INO; i++)
@@ -373,26 +367,25 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
inode = bfs_iget(s, BFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- goto out2;
+ goto out1;
}
s->s_root = d_make_root(inode);
if (!s->s_root) {
ret = -ENOMEM;
- goto out2;
+ goto out1;
}
info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
- info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
- - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
+ info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
info->si_freei = 0;
info->si_lf_eblk = 0;
/* can we read the last block? */
bh = sb_bread(s, info->si_blocks - 1);
if (!bh) {
- printf("Last block not available: %lu\n", info->si_blocks - 1);
+ printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1);
ret = -EIO;
- goto out3;
+ goto out2;
}
brelse(bh);
@@ -426,11 +419,11 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
(i_eoff != le32_to_cpu(-1) && i_eoff > s_size) ||
i_sblock * BFS_BSIZE > i_eoff) {
- printf("Inode 0x%08x corrupted\n", i);
+ printf("Inode 0x%08x corrupted on %s\n", i, s->s_id);
brelse(bh);
ret = -EIO;
- goto out3;
+ goto out2;
}
if (!di->i_ino) {
@@ -446,14 +439,12 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
}
brelse(bh);
brelse(sbh);
- bfs_dump_imap("read_super", s);
+ bfs_dump_imap("fill_super", s);
return 0;
-out3:
+out2:
dput(s->s_root);
s->s_root = NULL;
-out2:
- kfree(info->si_imap);
out1:
brelse(sbh);
out:
@@ -484,7 +475,7 @@ static int __init init_bfs_fs(void)
int err = init_inodecache();
if (err)
goto out1;
- err = register_filesystem(&bfs_fs_type);
+ err = register_filesystem(&bfs_fs_type);
if (err)
goto out;
return 0;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f46ad26..d0078cbb718b 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -42,10 +42,14 @@ static int load_script(struct linux_binprm *bprm)
fput(bprm->file);
bprm->file = NULL;
- bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
- if ((cp = strchr(bprm->buf, '\n')) == NULL)
- cp = bprm->buf+BINPRM_BUF_SIZE-1;
+ for (cp = bprm->buf+2;; cp++) {
+ if (cp >= bprm->buf + BINPRM_BUF_SIZE)
+ return -ENOEXEC;
+ if (!*cp || (*cp == '\n'))
+ break;
+ }
*cp = '\0';
+
while (cp > bprm->buf) {
cp--;
if ((*cp == ' ') || (*cp == '\t'))
diff --git a/fs/buffer.c b/fs/buffer.c
index 1286c2b95498..3618ce2c4541 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -954,10 +954,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -977,6 +987,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1032,6 +1045,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1043,6 +1062,18 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3215,6 +3246,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3253,11 +3289,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3281,6 +3324,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c005a5400f2e..9a2d86191793 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -808,7 +808,7 @@ static inline int default_congestion_kb(void)
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42bbe6824b4b..0627454298d6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -381,7 +381,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
*/
static inline int ep_events_available(struct eventpoll *ep)
{
- return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+ return !list_empty_careful(&ep->rdllist) ||
+ READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -471,7 +472,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* no re-entered.
*
* @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @max_nests: Maximum number of allowed nesting calls.
* @nproc: Nested call core function pointer.
* @priv: Opaque data to be passed to the @nproc callback.
* @cookie: Cookie to be used to identify this nested call.
@@ -480,7 +480,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* Returns: Returns the code returned by the @nproc callback, or -1 if
* the maximum recursion limit has been exceeded.
*/
-static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+static int ep_call_nested(struct nested_calls *ncalls,
int (*nproc)(void *, void *, int), void *priv,
void *cookie, void *ctx)
{
@@ -499,7 +499,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
*/
list_for_each_entry(tncur, lsthead, llink) {
if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > max_nests)) {
+ (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
/*
* Ops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself.
@@ -573,7 +573,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
{
int this_cpu = get_cpu();
- ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+ ep_call_nested(&poll_safewake_ncalls,
ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
put_cpu();
@@ -699,7 +699,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
*/
spin_lock_irq(&ep->wq.lock);
list_splice_init(&ep->rdllist, &txlist);
- ep->ovflist = NULL;
+ WRITE_ONCE(ep->ovflist, NULL);
spin_unlock_irq(&ep->wq.lock);
/*
@@ -713,7 +713,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
- for (nepi = ep->ovflist; (epi = nepi) != NULL;
+ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
@@ -731,7 +731,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
- ep->ovflist = EP_UNACTIVE_PTR;
+ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
/*
* Quickly re-inject items left on "txlist".
@@ -1154,10 +1154,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
- if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = ep->ovflist;
- ep->ovflist = epi;
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
if (epi->ws) {
/*
* Activate ep->ws since epi->ws may get
@@ -1333,7 +1333,6 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
}
} else {
error = ep_call_nested(&poll_loop_ncalls,
- EP_MAX_NESTS,
reverse_path_check_proc,
child_file, child_file,
current);
@@ -1367,7 +1366,7 @@ static int reverse_path_check(void)
/* let's call this for all tfiles */
list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
path_count_init();
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
reverse_path_check_proc, current_file,
current_file, current);
if (error)
@@ -1626,21 +1625,24 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
{
struct ep_send_events_data *esed = priv;
__poll_t revents;
- struct epitem *epi;
- struct epoll_event __user *uevent;
+ struct epitem *epi, *tmp;
+ struct epoll_event __user *uevent = esed->events;
struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
+ esed->res = 0;
/*
* We can loop without lock because we are passed a task private list.
* Items cannot vanish during the loop because ep_scan_ready_list() is
* holding "mtx" during this call.
*/
- for (esed->res = 0, uevent = esed->events;
- !list_empty(head) && esed->res < esed->maxevents;) {
- epi = list_first_entry(head, struct epitem, rdllink);
+ lockdep_assert_held(&ep->mtx);
+
+ list_for_each_entry_safe(epi, tmp, head, rdllink) {
+ if (esed->res >= esed->maxevents)
+ break;
/*
* Activate ep->ws before deactivating epi->ws to prevent
@@ -1660,42 +1662,42 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
list_del_init(&epi->rdllink);
- revents = ep_item_poll(epi, &pt, 1);
-
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, ep_scan_ready_list()
- * is holding "mtx", so no operations coming from userspace
+ * is holding ep->mtx, so no operations coming from userspace
* can change the item.
*/
- if (revents) {
- if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
- list_add(&epi->rdllink, head);
- ep_pm_stay_awake(epi);
- if (!esed->res)
- esed->res = -EFAULT;
- return 0;
- }
- esed->res++;
- uevent++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
- /*
- * If this file has been added with Level
- * Trigger mode, we need to insert back inside
- * the ready list, so that the next call to
- * epoll_wait() will check again the events
- * availability. At this point, no one can insert
- * into ep->rdllist besides us. The epoll_ctl()
- * callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
- * poll callback will queue them in ep->ovflist.
- */
- list_add_tail(&epi->rdllink, &ep->rdllist);
- ep_pm_stay_awake(epi);
- }
+ revents = ep_item_poll(epi, &pt, 1);
+ if (!revents)
+ continue;
+
+ if (__put_user(revents, &uevent->events) ||
+ __put_user(epi->event.data, &uevent->data)) {
+ list_add(&epi->rdllink, head);
+ ep_pm_stay_awake(epi);
+ if (!esed->res)
+ esed->res = -EFAULT;
+ return 0;
+ }
+ esed->res++;
+ uevent++;
+ if (epi->event.events & EPOLLONESHOT)
+ epi->event.events &= EP_PRIVATE_BITS;
+ else if (!(epi->event.events & EPOLLET)) {
+ /*
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, no one can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_scan_ready_list() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
}
}
@@ -1747,6 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
+ bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1761,11 +1764,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation.
+ * caller specified a non blocking operation. We still need
+ * lock because we could race and not see an epi being added
+ * to the ready list while in irq callback. Thus incorrectly
+ * returning 0 back to userspace.
*/
timed_out = 1;
+
spin_lock_irq(&ep->wq.lock);
- goto check_events;
+ eavail = ep_events_available(ep);
+ spin_unlock_irq(&ep->wq.lock);
+
+ goto send_events;
}
fetch_events:
@@ -1773,64 +1783,66 @@ fetch_events:
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ goto send_events;
- if (!ep_events_available(ep)) {
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
+ ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller.
- * We need to sleep here, and we will be wake up by
- * ep_poll_callback() when events will become available.
- */
+ /*
+ * We don't have any available event to return to the caller. We need
+ * to sleep here, and we will be woken by ep_poll_callback() when events
+ * become available.
+ */
+ if (!waiter) {
+ waiter = true;
init_waitqueue_entry(&wait, current);
- __add_wait_queue_exclusive(&ep->wq, &wait);
- for (;;) {
- /*
- * We don't want to sleep if the ep_poll_callback() sends us
- * a wakeup in between. That's why we set the task state
- * to TASK_INTERRUPTIBLE before doing the checks.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
- */
- if (fatal_signal_pending(current)) {
- res = -EINTR;
- break;
- }
- if (ep_events_available(ep) || timed_out)
- break;
- if (signal_pending(current)) {
- res = -EINTR;
- break;
- }
+ spin_lock_irq(&ep->wq.lock);
+ __add_wait_queue_exclusive(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
- spin_unlock_irq(&ep->wq.lock);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
- timed_out = 1;
+ for (;;) {
+ /*
+ * We don't want to sleep if the ep_poll_callback() sends us
+ * a wakeup in between. That's why we set the task state
+ * to TASK_INTERRUPTIBLE before doing the checks.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ if (fatal_signal_pending(current)) {
+ res = -EINTR;
+ break;
+ }
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ break;
+ if (signal_pending(current)) {
+ res = -EINTR;
+ break;
}
- __remove_wait_queue(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
+ if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+ timed_out = 1;
+ break;
+ }
}
-check_events:
- /* Is it worth to try to dig for events ? */
- eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ __set_current_state(TASK_RUNNING);
+send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
@@ -1840,6 +1852,12 @@ check_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
+ if (waiter) {
+ spin_lock_irq(&ep->wq.lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
+
return res;
}
@@ -1876,7 +1894,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit->visited)
continue;
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
ep_tovisit, current);
if (error != 0)
@@ -1916,7 +1934,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
int ret;
struct eventpoll *ep_cur, *ep_next;
- ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ret = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
/* clear visited list */
list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
diff --git a/fs/exec.c b/fs/exec.c
index acc3a5536384..c1f118898242 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -219,55 +219,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
if (ret <= 0)
return NULL;
- if (write) {
- unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
- unsigned long ptr_size, limit;
-
- /*
- * Since the stack will hold pointers to the strings, we
- * must account for them as well.
- *
- * The size calculation is the entire vma while each arg page is
- * built, so each time we get here it's calculating how far it
- * is currently (rather than each call being just the newly
- * added size from the arg page). As a result, we need to
- * always add the entire size of the pointers, so that on the
- * last call to get_arg_page() we'll actually have the entire
- * correct size.
- */
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
- if (ptr_size > ULONG_MAX - size)
- goto fail;
- size += ptr_size;
-
- acct_arg_size(bprm, size / PAGE_SIZE);
-
- /*
- * We've historically supported up to 32 pages (ARG_MAX)
- * of argument strings even with small stacks
- */
- if (size <= ARG_MAX)
- return page;
-
- /*
- * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
- * (whichever is smaller) for the argv+env strings.
- * This ensures that:
- * - the remaining binfmt code will not run out of stack space,
- * - the program will have a reasonable amount of stack left
- * to work from.
- */
- limit = _STK_LIM / 4 * 3;
- limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
- if (size > limit)
- goto fail;
- }
+ if (write)
+ acct_arg_size(bprm, vma_pages(bprm->vma));
return page;
-
-fail:
- put_page(page);
- return NULL;
}
static void put_arg_page(struct page *page)
@@ -493,6 +448,50 @@ static int count(struct user_arg_ptr argv, int max)
return i;
}
+static int prepare_arg_pages(struct linux_binprm *bprm,
+ struct user_arg_ptr argv, struct user_arg_ptr envp)
+{
+ unsigned long limit, ptr_size;
+
+ bprm->argc = count(argv, MAX_ARG_STRINGS);
+ if (bprm->argc < 0)
+ return bprm->argc;
+
+ bprm->envc = count(envp, MAX_ARG_STRINGS);
+ if (bprm->envc < 0)
+ return bprm->envc;
+
+ /*
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
+ * (whichever is smaller) for the argv+env strings.
+ * This ensures that:
+ * - the remaining binfmt code will not run out of stack space,
+ * - the program will have a reasonable amount of stack left
+ * to work from.
+ */
+ limit = _STK_LIM / 4 * 3;
+ limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
+ /*
+ * We've historically supported up to 32 pages (ARG_MAX)
+ * of argument strings even with small stacks
+ */
+ limit = max_t(unsigned long, limit, ARG_MAX);
+ /*
+ * We must account for the size of all the argv and envp pointers to
+ * the argv and envp strings, since they will also take up space in
+ * the stack. They aren't stored until much later when we can't
+ * signal to the parent that the child has run out of stack space.
+ * Instead, calculate it here so it's possible to fail gracefully.
+ */
+ ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ if (limit <= ptr_size)
+ return -E2BIG;
+ limit -= ptr_size;
+
+ bprm->argmin = bprm->p - limit;
+ return 0;
+}
+
/*
* 'copy_strings()' copies argument/environment strings from the old
* processes's memory to the new process's stack. The call to get_user_pages()
@@ -528,6 +527,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
pos = bprm->p;
str += len;
bprm->p -= len;
+ if (bprm->p < bprm->argmin)
+ goto out;
while (len > 0) {
int offset, bytes_to_copy;
@@ -1790,12 +1791,8 @@ static int __do_execve_file(int fd, struct filename *filename,
if (retval)
goto out_unmark;
- bprm->argc = count(argv, MAX_ARG_STRINGS);
- if ((retval = bprm->argc) < 0)
- goto out;
-
- bprm->envc = count(envp, MAX_ARG_STRINGS);
- if ((retval = bprm->envc) < 0)
+ retval = prepare_arg_pages(bprm, argv, envp);
+ if (retval < 0)
goto out;
retval = prepare_binprm(bprm);
diff --git a/fs/file_table.c b/fs/file_table.c
index e03c8d121c6c..10e0a3dcea4d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -383,10 +383,11 @@ void __init files_init(void)
void __init files_maxfiles_init(void)
{
unsigned long n;
- unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+ unsigned long nr_pages = totalram_pages();
+ unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
- memreserve = min(memreserve, totalram_pages - 1);
- n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
+ memreserve = min(memreserve, nr_pages - 1);
+ n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4727ef612019..f0fe74fa4c85 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -825,7 +825,7 @@ static const struct super_operations fuse_super_operations = {
static void sanitize_global_limit(unsigned *limit)
{
if (*limit == 0)
- *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
+ *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) /
sizeof(struct fuse_req);
if (*limit >= 1 << 16)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f37662675c3a..29a9dcfbe81f 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -565,6 +565,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
.symlink = hfsplus_symlink,
.mknod = hfsplus_mknod,
.rename = hfsplus_rename,
+ .getattr = hfsplus_getattr,
.listxattr = hfsplus_listxattr,
};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dd7ad9f13e3a..b8471bf05def 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -488,6 +488,8 @@ void hfsplus_inode_write_fork(struct inode *inode,
struct hfsplus_fork_raw *fork);
int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
int hfsplus_cat_write_inode(struct inode *inode);
+int hfsplus_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d7ab9d8c4b67..d131c8ea7eb6 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -270,6 +270,26 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
+int hfsplus_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+
+ if (inode->i_flags & S_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (inode->i_flags & S_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (hip->userflags & HFSPLUS_FLG_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
+ stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP;
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
@@ -329,6 +349,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
static const struct inode_operations hfsplus_file_inode_operations = {
.setattr = hfsplus_setattr,
+ .getattr = hfsplus_getattr,
.listxattr = hfsplus_listxattr,
};
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 586726a590d8..4f15665f0ad1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2121,7 +2121,7 @@ int __init nfs_init_writepagecache(void)
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
- nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
if (nfs_congestion_kb > 256*1024)
nfs_congestion_kb = 256*1024;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index e2fe0e9ce0df..da52b594362a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -99,7 +99,7 @@ static unsigned int
nfsd_cache_size_limit(void)
{
unsigned int limit;
- unsigned long low_pages = totalram_pages - totalhigh_pages;
+ unsigned long low_pages = totalram_pages() - totalhigh_pages();
limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
return min_t(unsigned int, limit, 256*1024);
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index ab172e5f51d9..5becc8acc8f4 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
/* return (void *)__get_free_page(gfp_mask); */
}
- if (likely((size >> PAGE_SHIFT) < totalram_pages))
+ if (likely((size >> PAGE_SHIFT) < totalram_pages()))
return __vmalloc(size, gfp_mask, PAGE_KERNEL);
return NULL;
}
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 99ee093182cb..cc9b32b9db7c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9b2ed62dd638..f3c20b279eb2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -582,9 +582,10 @@ bail:
}
static int o2hb_read_slots(struct o2hb_region *reg,
+ unsigned int begin_slot,
unsigned int max_slots)
{
- unsigned int current_slot=0;
+ unsigned int current_slot = begin_slot;
int status;
struct o2hb_bio_wait_ctxt wc;
struct bio *bio;
@@ -1093,9 +1094,14 @@ static int o2hb_highest_node(unsigned long *nodes, int numbits)
return find_last_bit(nodes, numbits);
}
+static int o2hb_lowest_node(unsigned long *nodes, int numbits)
+{
+ return find_first_bit(nodes, numbits);
+}
+
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node;
+ int i, ret, highest_node, lowest_node;
int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -1120,7 +1126,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
}
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
- if (highest_node >= O2NM_MAX_NODES) {
+ lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES);
+ if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) {
mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
ret = -EINVAL;
goto bail;
@@ -1130,7 +1137,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* yet. Of course, if the node definitions have holes in them
* then we're reading an empty slot anyway... Consider this
* best-effort. */
- ret = o2hb_read_slots(reg, highest_node + 1);
+ ret = o2hb_read_slots(reg, lowest_node, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
goto bail;
@@ -1801,7 +1808,7 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
- ret = o2hb_read_slots(reg, reg->hr_blocks);
+ ret = o2hb_read_slots(reg, 0, reg->hr_blocks);
if (ret)
goto out;
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index bd1aab1f49a4..ef2854422a6e 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)/..
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index eed3db8c5b49..33431a0296a3 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)/..
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 642e471a6472..b03dd46237ce 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -179,7 +179,7 @@ bail:
static int dlmfs_file_release(struct inode *inode,
struct file *file)
{
- int level, status;
+ int level;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
struct dlmfs_filp_private *fp = file->private_data;
@@ -188,7 +188,6 @@ static int dlmfs_file_release(struct inode *inode,
mlog(0, "close called on inode %lu\n", inode->i_ino);
- status = 0;
if (fp) {
level = fp->fp_lock_level;
if (level != DLM_LOCK_IV)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7642b6712c39..308f05be107c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -835,7 +835,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
u32 *numbits,
struct ocfs2_alloc_reservation *resv)
{
- int numfound = 0, bitoff, left, startoff, lastzero;
+ int numfound = 0, bitoff, left, startoff;
int local_resv = 0;
struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
@@ -873,7 +873,6 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
- lastzero = -1;
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
if (bitoff == left) {
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 3f1685d7d43b..1565dd8e8856 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -157,18 +157,14 @@ out:
}
/*
- * lock allocators, and reserving appropriate number of bits for
- * meta blocks and data clusters.
- *
- * in some cases, we don't need to reserve clusters, just let data_ac
- * be NULL.
+ * lock allocator, and reserve appropriate number of bits for
+ * meta blocks.
*/
-static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters_to_move,
u32 extents_to_split,
struct ocfs2_alloc_context **meta_ac,
- struct ocfs2_alloc_context **data_ac,
int extra_blocks,
int *credits)
{
@@ -193,13 +189,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
goto out;
}
- if (data_ac) {
- ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
@@ -259,10 +248,10 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
- &context->meta_ac,
- &context->data_ac,
- extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ *len, 1,
+ &context->meta_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -285,6 +274,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
+ /*
+ * Make sure ocfs2_reserve_cluster is called after
+ * __ocfs2_flush_truncate_log, otherwise, dead lock may happen.
+ *
+ * If ocfs2_reserve_cluster is called
+ * before __ocfs2_flush_truncate_log, dead lock on global bitmap
+ * may happen.
+ *
+ */
+ ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -617,9 +621,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
- &context->meta_ac,
- NULL, extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ len, 1,
+ &context->meta_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ce3465479447..58a8dc3fd6c6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -530,7 +530,7 @@ static const struct file_operations proc_lstats_operations = {
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long totalpages = totalram_pages + total_swap_pages;
+ unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
points = oom_badness(task, NULL, NULL, totalpages) *
@@ -2356,10 +2356,13 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
return -ESRCH;
if (p != current) {
- if (!capable(CAP_SYS_NICE)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
count = -EPERM;
goto out;
}
+ rcu_read_unlock();
err = security_task_setscheduler(p);
if (err) {
@@ -2392,11 +2395,14 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
return -ESRCH;
if (p != current) {
-
- if (!capable(CAP_SYS_NICE)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
goto out;
}
+ rcu_read_unlock();
+
err = security_task_getscheduler(p);
if (err)
goto out;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47c3764c469b..39e96a21366e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -653,13 +653,25 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
};
+ unsigned long flags = vma->vm_flags;
size_t i;
+ /*
+ * Disabling thp is possible through both MADV_NOHUGEPAGE and
+ * PR_SET_THP_DISABLE. Both historically used VM_NOHUGEPAGE. Since
+ * the introduction of MMF_DISABLE_THP, however, userspace needs the
+ * ability to detect vmas where thp is not eligible in the same manner.
+ */
+ if (vma->vm_mm && test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) {
+ flags &= ~VM_HUGEPAGE;
+ flags |= VM_NOHUGEPAGE;
+ }
+
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
- if (vma->vm_flags & (1UL << i)) {
+ if (flags & (1UL << i)) {
seq_putc(m, mnemonics[i][0]);
seq_putc(m, mnemonics[i][1]);
seq_putc(m, ' ');
diff --git a/fs/proc/util.c b/fs/proc/util.c
index b161cfa0f9fa..98f8adc17345 100644
--- a/fs/proc/util.c
+++ b/fs/proc/util.c
@@ -1,4 +1,5 @@
#include <linux/dcache.h>
+#include "internal.h"
unsigned name_to_int(const struct qstr *qstr)
{
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 681881dc8a9d..11ce3379abbb 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -53,7 +53,7 @@ struct userfaultfd_ctx {
/* a refile sequence protected by fault_pending_wqh lock */
struct seqcount refile_seq;
/* pseudo fd refcounting */
- atomic_t refcount;
+ refcount_t refcount;
/* userfaultfd syscall flags */
unsigned int flags;
/* features requested from the userspace */
@@ -140,8 +140,7 @@ out:
*/
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
- if (!atomic_inc_not_zero(&ctx->refcount))
- BUG();
+ refcount_inc(&ctx->refcount);
}
/**
@@ -154,7 +153,7 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
*/
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
@@ -686,7 +685,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
return -ENOMEM;
}
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
ctx->flags = octx->flags;
ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
@@ -1911,7 +1910,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
if (!ctx)
return -ENOMEM;
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
ctx->state = UFFD_STATE_WAIT_API;