summaryrefslogtreecommitdiff
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c138
1 files changed, 104 insertions, 34 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e943370107d0..d398f6bf6d74 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
+#include <linux/miscdevice.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -413,13 +414,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
- if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
- ctx->flags & UFFD_USER_MODE_ONLY) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
+ if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
goto out;
- }
/*
* If it's already released don't get it. This avoids to loop
@@ -613,14 +609,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
if (release_new_ctx) {
struct vm_area_struct *vma;
struct mm_struct *mm = release_new_ctx->mm;
+ VMA_ITERATOR(vmi, mm, 0);
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~__VM_UFFD_FLAGS;
}
+ }
mmap_write_unlock(mm);
userfaultfd_ctx_put(release_new_ctx);
@@ -801,11 +799,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
return false;
}
-int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *unmaps)
+int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *unmaps)
{
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+ VMA_ITERATOR(vmi, mm, start);
+ struct vm_area_struct *vma;
+
+ for_each_vma_range(vmi, vma, end) {
struct userfaultfd_unmap_ctx *unmap_ctx;
struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
@@ -855,6 +855,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
WRITE_ONCE(ctx->released, true);
@@ -871,7 +872,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
*/
mmap_write_lock(mm);
prev = NULL;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
cond_resched();
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
!!(vma->vm_flags & __VM_UFFD_FLAGS));
@@ -885,10 +886,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev)
+ if (prev) {
+ mas_pause(&mas);
vma = prev;
- else
+ } else {
prev = vma;
+ }
+
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1270,6 +1274,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool found;
bool basic_ioctls;
unsigned long start, end, vma_end;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1312,7 +1317,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- vma = find_vma_prev(mm, start, &prev);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma)
goto out_unlock;
@@ -1337,7 +1343,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
*/
found = false;
basic_ioctls = false;
- for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1397,8 +1403,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
}
BUG_ON(!found);
- if (vma->vm_start < start)
- prev = vma;
+ mas_set(&mas, start);
+ prev = mas_prev(&mas, 0);
+ if (prev != vma)
+ mas_next(&mas, ULONG_MAX);
ret = 0;
do {
@@ -1428,6 +1436,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
((struct vm_userfaultfd_ctx){ ctx }),
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto next;
}
@@ -1435,11 +1445,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
ret = split_vma(mm, vma, start, 1);
if (ret)
break;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end > end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
break;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
next:
/*
@@ -1456,8 +1470,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
skip:
prev = vma;
start = vma->vm_end;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ vma = mas_next(&mas, end - 1);
+ } while (vma);
out_unlock:
mmap_write_unlock(mm);
mmput(mm);
@@ -1501,6 +1515,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1519,7 +1534,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- vma = find_vma_prev(mm, start, &prev);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma)
goto out_unlock;
@@ -1544,7 +1560,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
*/
found = false;
ret = -EINVAL;
- for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1564,8 +1580,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
}
BUG_ON(!found);
- if (vma->vm_start < start)
- prev = vma;
+ mas_set(&mas, start);
+ prev = mas_prev(&mas, 0);
+ if (prev != vma)
+ mas_next(&mas, ULONG_MAX);
ret = 0;
do {
@@ -1630,8 +1648,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
skip:
prev = vma;
start = vma->vm_end;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ vma = mas_next(&mas, end - 1);
+ } while (vma);
out_unlock:
mmap_write_unlock(mm);
mmput(mm);
@@ -2052,19 +2070,33 @@ static void init_once_userfaultfd_ctx(void *mem)
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}
-SYSCALL_DEFINE1(userfaultfd, int, flags)
+static inline bool userfaultfd_allowed(bool is_syscall, int flags)
+{
+ bool kernel_faults = !(flags & UFFD_USER_MODE_ONLY);
+ bool allow_unprivileged = sysctl_unprivileged_userfaultfd;
+
+ /* userfaultfd(2) access is controlled by sysctl + capability. */
+ if (is_syscall && kernel_faults) {
+ if (!allow_unprivileged && !capable(CAP_SYS_PTRACE))
+ return false;
+ }
+
+ /*
+ * For /dev/userfaultfd, access is to be controlled using e.g.
+ * permissions on the device node. We assume this is correctly
+ * configured by userspace, so we simply allow access here.
+ */
+
+ return true;
+}
+
+static int new_userfaultfd(bool is_syscall, int flags)
{
struct userfaultfd_ctx *ctx;
int fd;
- if (!sysctl_unprivileged_userfaultfd &&
- (flags & UFFD_USER_MODE_ONLY) == 0 &&
- !capable(CAP_SYS_PTRACE)) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
+ if (!userfaultfd_allowed(is_syscall, flags))
return -EPERM;
- }
BUG_ON(!current->mm);
@@ -2083,6 +2115,10 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
+ /*
+ * If UFFD_USER_MODE_ONLY is not set, then userfaultfd_allowed() above
+ * decided that kernel faults were allowed and should be handled.
+ */
ctx->released = false;
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
@@ -2098,8 +2134,42 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
return fd;
}
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ return new_userfaultfd(true, flags);
+}
+
+static int userfaultfd_dev_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
+{
+ if (cmd != USERFAULTFD_IOC_NEW)
+ return -EINVAL;
+
+ return new_userfaultfd(false, flags);
+}
+
+static const struct file_operations userfaultfd_dev_fops = {
+ .open = userfaultfd_dev_open,
+ .unlocked_ioctl = userfaultfd_dev_ioctl,
+ .compat_ioctl = userfaultfd_dev_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice userfaultfd_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "userfaultfd",
+ .fops = &userfaultfd_dev_fops
+};
+
static int __init userfaultfd_init(void)
{
+ WARN_ON(misc_register(&userfaultfd_misc));
+
userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
sizeof(struct userfaultfd_ctx),
0,