summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 12:25:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 12:25:03 -0700
commitc03098d4b9ad76bca2966a8769dcfe59f7f85103 (patch)
treee7e2a6a0a84ad29baa14c018e3d4dcb12bd08fd6 /mm
parentab2e7f4b46bf8fccf088ec496b3bb26b43e91340 (diff)
parentb01b2d72da25c000aeb124bc78daf3fb998be2b6 (diff)
downloadlinux-next-c03098d4b9ad76bca2966a8769dcfe59f7f85103.tar.gz
Merge tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2
Pull gfs2 mmap + page fault deadlocks fixes from Andreas Gruenbacher: "Functions gfs2_file_read_iter and gfs2_file_write_iter are both accessing the user buffer to write to or read from while holding the inode glock. In the most basic deadlock scenario, that buffer will not be resident and it will be mapped to the same file. Accessing the buffer will trigger a page fault, and gfs2 will deadlock trying to take the same inode glock again while trying to handle that fault. Fix that and similar, more complex scenarios by disabling page faults while accessing user buffers. To make this work, introduce a small amount of new infrastructure and fix some bugs that didn't trigger so far, with page faults enabled" * tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2: gfs2: Fix mmap + page fault deadlocks for direct I/O iov_iter: Introduce nofault flag to disable page faults gup: Introduce FOLL_NOFAULT flag to disable page faults iomap: Add done_before argument to iomap_dio_rw iomap: Support partial direct I/O on user copy failures iomap: Fix iomap_dio_rw return value for user copies gfs2: Fix mmap + page fault deadlocks for buffered I/O gfs2: Eliminate ip->i_gh gfs2: Move the inode glock locking to gfs2_file_buffered_write gfs2: Introduce flag for glock holder auto-demotion gfs2: Clean up function may_grant gfs2: Add wrapper for iomap_file_buffered_write iov_iter: Introduce fault_in_iov_iter_writeable iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} powerpc/kvm: Fix kvm_use_magic_page iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/gup.c139
2 files changed, 140 insertions, 3 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 5e206a429b57..bfcef6ff7a27 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -89,7 +89,7 @@
* ->lock_page (filemap_fault, access_process_vm)
*
* ->i_rwsem (generic_perform_write)
- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
+ * ->mmap_lock (fault_in_readable->do_page_fault)
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
@@ -3733,7 +3733,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/mm/gup.c b/mm/gup.c
index 886d6148d3d0..e1c7e4bde11f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -918,6 +918,8 @@ static int faultin_page(struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
+ if (*flags & FOLL_NOFAULT)
+ return -EFAULT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
@@ -1657,6 +1659,141 @@ finish_or_fault:
#endif /* !CONFIG_MMU */
/**
+ * fault_in_writeable - fault in userspace address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_writeable(char __user *uaddr, size_t size)
+{
+ char __user *start = uaddr, *end;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!PAGE_ALIGNED(uaddr)) {
+ if (unlikely(__put_user(0, uaddr) != 0))
+ return size;
+ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ if (unlikely(__put_user(0, uaddr) != 0))
+ goto out;
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_writeable);
+
+/*
+ * fault_in_safe_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: length of address range
+ *
+ * Faults in an address range using get_user_pages, i.e., without triggering
+ * hardware page faults. This is primarily useful when we already know that
+ * some or all of the pages in the address range aren't in memory.
+ *
+ * Other than fault_in_writeable(), this function is non-destructive.
+ *
+ * Note that we don't pin or otherwise hold the pages referenced that we fault
+ * in. There's no guarantee that they'll stay in memory for any duration of
+ * time.
+ *
+ * Returns the number of bytes not faulted in, like copy_to_user() and
+ * copy_from_user().
+ */
+size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
+{
+ unsigned long start = (unsigned long)untagged_addr(uaddr);
+ unsigned long end, nstart, nend;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+
+ nstart = start & PAGE_MASK;
+ end = PAGE_ALIGN(start + size);
+ if (end < nstart)
+ end = 0;
+ for (; nstart != end; nstart = nend) {
+ unsigned long nr_pages;
+ long ret;
+
+ if (!locked) {
+ locked = 1;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ nend = end ? min(end, vma->vm_end) : vma->vm_end;
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ nr_pages = (nend - nstart) / PAGE_SIZE;
+ ret = __get_user_pages_locked(mm, nstart, nr_pages,
+ NULL, NULL, &locked,
+ FOLL_TOUCH | FOLL_WRITE);
+ if (ret <= 0)
+ break;
+ nend = nstart + ret * PAGE_SIZE;
+ }
+ if (locked)
+ mmap_read_unlock(mm);
+ if (nstart == end)
+ return 0;
+ return size - min_t(size_t, nstart - start, size);
+}
+EXPORT_SYMBOL(fault_in_safe_writeable);
+
+/**
+ * fault_in_readable - fault in userspace address range for reading
+ * @uaddr: start of user address range
+ * @size: size of user address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_readable(const char __user *uaddr, size_t size)
+{
+ const char __user *start = uaddr, *end;
+ volatile char c;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!PAGE_ALIGNED(uaddr)) {
+ if (unlikely(__get_user(c, uaddr) != 0))
+ return size;
+ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ if (unlikely(__get_user(c, uaddr) != 0))
+ goto out;
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ (void)c;
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_readable);
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
@@ -2708,7 +2845,7 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)