diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-02 12:25:03 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-02 12:25:03 -0700 |
commit | c03098d4b9ad76bca2966a8769dcfe59f7f85103 (patch) | |
tree | e7e2a6a0a84ad29baa14c018e3d4dcb12bd08fd6 /mm | |
parent | ab2e7f4b46bf8fccf088ec496b3bb26b43e91340 (diff) | |
parent | b01b2d72da25c000aeb124bc78daf3fb998be2b6 (diff) | |
download | linux-next-c03098d4b9ad76bca2966a8769dcfe59f7f85103.tar.gz |
Merge tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2
Pull gfs2 mmap + page fault deadlocks fixes from Andreas Gruenbacher:
"Functions gfs2_file_read_iter and gfs2_file_write_iter are both
accessing the user buffer to write to or read from while holding the
inode glock.
In the most basic deadlock scenario, that buffer will not be resident
and it will be mapped to the same file. Accessing the buffer will
trigger a page fault, and gfs2 will deadlock trying to take the same
inode glock again while trying to handle that fault.
Fix that and similar, more complex scenarios by disabling page faults
while accessing user buffers. To make this work, introduce a small
amount of new infrastructure and fix some bugs that didn't trigger so
far, with page faults enabled"
* tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2:
gfs2: Fix mmap + page fault deadlocks for direct I/O
iov_iter: Introduce nofault flag to disable page faults
gup: Introduce FOLL_NOFAULT flag to disable page faults
iomap: Add done_before argument to iomap_dio_rw
iomap: Support partial direct I/O on user copy failures
iomap: Fix iomap_dio_rw return value for user copies
gfs2: Fix mmap + page fault deadlocks for buffered I/O
gfs2: Eliminate ip->i_gh
gfs2: Move the inode glock locking to gfs2_file_buffered_write
gfs2: Introduce flag for glock holder auto-demotion
gfs2: Clean up function may_grant
gfs2: Add wrapper for iomap_file_buffered_write
iov_iter: Introduce fault_in_iov_iter_writeable
iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable}
powerpc/kvm: Fix kvm_use_magic_page
iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 4 | ||||
-rw-r--r-- | mm/gup.c | 139 |
2 files changed, 140 insertions, 3 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 5e206a429b57..bfcef6ff7a27 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -89,7 +89,7 @@ * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) - * ->mmap_lock (fault_in_pages_readable->do_page_fault) + * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) @@ -3733,7 +3733,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { status = -EFAULT; break; } @@ -918,6 +918,8 @@ static int faultin_page(struct vm_area_struct *vma, /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; + if (*flags & FOLL_NOFAULT) + return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) @@ -1657,6 +1659,141 @@ finish_or_fault: #endif /* !CONFIG_MMU */ /** + * fault_in_writeable - fault in userspace address range for writing + * @uaddr: start of address range + * @size: size of address range + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). + */ +size_t fault_in_writeable(char __user *uaddr, size_t size) +{ + char __user *start = uaddr, *end; + + if (unlikely(size == 0)) + return 0; + if (!PAGE_ALIGNED(uaddr)) { + if (unlikely(__put_user(0, uaddr) != 0)) + return size; + uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); + } + end = (char __user *)PAGE_ALIGN((unsigned long)start + size); + if (unlikely(end < start)) + end = NULL; + while (uaddr != end) { + if (unlikely(__put_user(0, uaddr) != 0)) + goto out; + uaddr += PAGE_SIZE; + } + +out: + if (size > uaddr - start) + return size - (uaddr - start); + return 0; +} +EXPORT_SYMBOL(fault_in_writeable); + +/* + * fault_in_safe_writeable - fault in an address range for writing + * @uaddr: start of address range + * @size: length of address range + * + * Faults in an address range using get_user_pages, i.e., without triggering + * hardware page faults. This is primarily useful when we already know that + * some or all of the pages in the address range aren't in memory. + * + * Other than fault_in_writeable(), this function is non-destructive. + * + * Note that we don't pin or otherwise hold the pages referenced that we fault + * in. There's no guarantee that they'll stay in memory for any duration of + * time. + * + * Returns the number of bytes not faulted in, like copy_to_user() and + * copy_from_user(). + */ +size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) +{ + unsigned long start = (unsigned long)untagged_addr(uaddr); + unsigned long end, nstart, nend; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + int locked = 0; + + nstart = start & PAGE_MASK; + end = PAGE_ALIGN(start + size); + if (end < nstart) + end = 0; + for (; nstart != end; nstart = nend) { + unsigned long nr_pages; + long ret; + + if (!locked) { + locked = 1; + mmap_read_lock(mm); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + nend = end ? min(end, vma->vm_end) : vma->vm_end; + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + nr_pages = (nend - nstart) / PAGE_SIZE; + ret = __get_user_pages_locked(mm, nstart, nr_pages, + NULL, NULL, &locked, + FOLL_TOUCH | FOLL_WRITE); + if (ret <= 0) + break; + nend = nstart + ret * PAGE_SIZE; + } + if (locked) + mmap_read_unlock(mm); + if (nstart == end) + return 0; + return size - min_t(size_t, nstart - start, size); +} +EXPORT_SYMBOL(fault_in_safe_writeable); + +/** + * fault_in_readable - fault in userspace address range for reading + * @uaddr: start of user address range + * @size: size of user address range + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). + */ +size_t fault_in_readable(const char __user *uaddr, size_t size) +{ + const char __user *start = uaddr, *end; + volatile char c; + + if (unlikely(size == 0)) + return 0; + if (!PAGE_ALIGNED(uaddr)) { + if (unlikely(__get_user(c, uaddr) != 0)) + return size; + uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); + } + end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); + if (unlikely(end < start)) + end = NULL; + while (uaddr != end) { + if (unlikely(__get_user(c, uaddr) != 0)) + goto out; + uaddr += PAGE_SIZE; + } + +out: + (void)c; + if (size > uaddr - start) + return size - (uaddr - start); + return 0; +} +EXPORT_SYMBOL(fault_in_readable); + +/** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * @@ -2708,7 +2845,7 @@ static int internal_get_user_pages_fast(unsigned long start, if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | - FOLL_FAST_ONLY))) + FOLL_FAST_ONLY | FOLL_NOFAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) |