Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c             20
-rw-r--r--  mm/frontswap.c            8
-rw-r--r--  mm/gup.c                118
-rw-r--r--  mm/hmm.c                  3
-rw-r--r--  mm/hugetlb.c             35
-rw-r--r--  mm/internal.h             8
-rw-r--r--  mm/kmemleak.c             2
-rw-r--r--  mm/ksm.c                  3
-rw-r--r--  mm/list_lru.c             2
-rw-r--r--  mm/madvise.c            190
-rw-r--r--  mm/memory-failure.c      15
-rw-r--r--  mm/memory.c              73
-rw-r--r--  mm/memory_hotplug.c      42
-rw-r--r--  mm/mempolicy.c           29
-rw-r--r--  mm/mempool.c              2
-rw-r--r--  mm/migrate.c             51
-rw-r--r--  mm/page_alloc.c           8
-rw-r--r--  mm/page_counter.c        13
-rw-r--r--  mm/page_io.c              8
-rw-r--r--  mm/page_isolation.c       5
-rw-r--r--  mm/process_vm_access.c    2
-rw-r--r--  mm/rmap.c                 2
-rw-r--r--  mm/swap.c                 5
-rw-r--r--  mm/swap_state.c           4
-rw-r--r--  mm/swapfile.c            31
25 files changed, 437 insertions, 242 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 9d43acdba8aa..cf4bdf7803b6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2425,6 +2425,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
pgoff_t offset = vmf->pgoff;
+ unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
@@ -2440,14 +2441,15 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
}
/* Avoid banging the cache line if not needed */
- if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
- ra->mmap_miss++;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss < MMAP_LOTSAMISS * 10)
+ WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
- if (ra->mmap_miss > MMAP_LOTSAMISS)
+ if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
/*
@@ -2473,13 +2475,15 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
+ unsigned int mmap_miss;
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
- if (ra->mmap_miss > 0)
- ra->mmap_miss--;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
if (PageReadahead(page)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
@@ -2645,6 +2649,7 @@ void filemap_map_pages(struct vm_fault *vmf,
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *page;
+ unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2681,8 +2686,8 @@ void filemap_map_pages(struct vm_fault *vmf,
if (page->index >= max_idx)
goto unlock;
- if (file->f_ra.mmap_miss > 0)
- file->f_ra.mmap_miss--;
+ if (mmap_miss > 0)
+ mmap_miss--;
vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
if (vmf->pte)
@@ -2702,6 +2707,7 @@ next:
break;
}
rcu_read_unlock();
+ WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
EXPORT_SYMBOL(filemap_map_pages);
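The mmap_miss conversion above is a load-once/store-once pattern: the shared field is read into a local with READ_ONCE(), modified locally, and published with a single WRITE_ONCE(), so concurrent faults on the same file never do a racy read-modify-write on the field itself; filemap_map_pages() additionally batches the whole loop on a local copy and writes it back once after rcu_read_unlock(). A minimal sketch of the pattern, using an illustrative structure that is not part of this patch:

#include <linux/compiler.h>	/* READ_ONCE(), WRITE_ONCE() */

struct example_ra {
	unsigned int miss;	/* updated locklessly from concurrent faults */
};

static void example_note_miss(struct example_ra *ra, unsigned int cap)
{
	unsigned int miss = READ_ONCE(ra->miss);	/* one marked load */

	if (miss < cap)
		WRITE_ONCE(ra->miss, miss + 1);	/* one marked store; no RMW on the shared field */
}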
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 9d977b1fc016..2183a56c7874 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -61,16 +61,16 @@ static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;
static inline void inc_frontswap_loads(void) {
- frontswap_loads++;
+ data_race(frontswap_loads++);
}
static inline void inc_frontswap_succ_stores(void) {
- frontswap_succ_stores++;
+ data_race(frontswap_succ_stores++);
}
static inline void inc_frontswap_failed_stores(void) {
- frontswap_failed_stores++;
+ data_race(frontswap_failed_stores++);
}
static inline void inc_frontswap_invalidates(void) {
- frontswap_invalidates++;
+ data_race(frontswap_invalidates++);
}
#else
static inline void inc_frontswap_loads(void) { }
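The data_race() wrappers above tell KCSAN that these plain, unsynchronized increments are intentional: the frontswap counters are statistics where an occasional lost update is acceptable. A hedged sketch of the same annotation on an illustrative counter (not one from this patch):

#include <linux/compiler.h>	/* data_race() */
#include <linux/types.h>

static u64 example_stat_count;	/* purely statistical; readers tolerate staleness */

static inline void inc_example_stat(void)
{
	/* Intentionally racy: KCSAN is told not to report this access. */
	data_race(example_stat_count++);
}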
diff --git a/mm/gup.c b/mm/gup.c
index 5daadae475ea..2cc5eba44362 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -859,7 +859,7 @@ unmap:
* does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+static int faultin_page(struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
@@ -884,7 +884,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_TRIED;
}
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -893,13 +893,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
BUG();
}
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
if (ret & VM_FAULT_RETRY) {
if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
@@ -969,7 +962,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
/**
* __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1028,7 +1020,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1110,8 +1102,7 @@ retry:
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- ret = faultin_page(tsk, vma, start, &foll_flags,
- locked);
+ ret = faultin_page(vma, start, &foll_flags, locked);
switch (ret) {
case 0:
goto retry;
@@ -1185,8 +1176,6 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
/**
* fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
@@ -1214,7 +1203,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* This function will not return with an unlocked mmap_lock. So it has not the
* same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked)
{
@@ -1238,7 +1227,7 @@ retry:
fatal_signal_pending(current))
return -EINTR;
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1255,12 +1244,6 @@ retry:
goto retry;
}
- if (tsk) {
- if (major)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
@@ -1269,8 +1252,7 @@ EXPORT_SYMBOL_GPL(fixup_user_fault);
* Please note that this function, unlike __get_user_pages will not
* return 0 for nr_pages > 0 without FOLL_NOWAIT
*/
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1303,7 +1285,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done = 0;
lock_dropped = false;
for (;;) {
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
vmas, locked);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
@@ -1363,7 +1345,7 @@ retry:
}
*locked = 1;
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
pages, NULL, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
@@ -1449,7 +1431,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
@@ -1533,7 +1515,7 @@ struct page *get_dump_page(unsigned long addr)
struct vm_area_struct *vma;
struct page *page;
- if (__get_user_pages(current, current->mm, addr, 1,
+ if (__get_user_pages(current->mm, addr, 1,
FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
NULL) < 1)
return NULL;
@@ -1542,8 +1524,7 @@ struct page *get_dump_page(unsigned long addr)
}
#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
-static long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm, unsigned long start,
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
unsigned int foll_flags)
@@ -1659,8 +1640,7 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
return __alloc_pages_node(nid, gfp_mask, 0);
}
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1734,7 +1714,7 @@ check_again:
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
@@ -1748,8 +1728,7 @@ check_again:
return ret;
}
#else
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1764,8 +1743,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
* allows us to process the FOLL_LONGTERM flag.
*/
-static long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1790,7 +1768,7 @@ static long __gup_longterm_locked(struct task_struct *tsk,
flags = memalloc_nocma_save();
}
- rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages,
vmas_tmp, NULL, gup_flags);
if (gup_flags & FOLL_LONGTERM) {
@@ -1805,7 +1783,7 @@ static long __gup_longterm_locked(struct task_struct *tsk,
goto out;
}
- rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
vmas_tmp, gup_flags);
}
@@ -1815,22 +1793,20 @@ out:
return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
unsigned int flags)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
#ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1849,20 +1825,18 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* This will check the vmas (even if our vmas arg is NULL)
* and return -ENOTSUPP if DAX isn't allowed in this case:
*/
- return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
vmas, gup_flags | FOLL_TOUCH |
FOLL_REMOTE);
}
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
/**
* get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1921,7 +1895,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1933,13 +1907,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(get_user_pages_remote);
#else /* CONFIG_MMU */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1947,8 +1921,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1968,11 +1941,10 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
+ * This is the same as get_user_pages_remote(), just with a less-flexible
+ * calling convention where we assume that the mm being operated on belongs to
+ * the current task, and doesn't allow passing of a locked parameter. We also
+ * obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1985,7 +1957,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1995,7 +1967,7 @@ EXPORT_SYMBOL(get_user_pages);
*
* mmap_read_lock(mm);
* do_something()
- * get_user_pages(tsk, mm, ..., pages, NULL);
+ * get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* to:
@@ -2003,7 +1975,7 @@ EXPORT_SYMBOL(get_user_pages);
* int locked = 1;
* mmap_read_lock(mm);
* do_something()
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * get_user_pages_locked(mm, ..., pages, &locked);
* if (locked)
* mmap_read_unlock(mm);
*
@@ -2041,7 +2013,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
@@ -2051,12 +2023,12 @@ EXPORT_SYMBOL(get_user_pages_locked);
* get_user_pages_unlocked() is suitable to replace the form:
*
* mmap_read_lock(mm);
- * get_user_pages(tsk, mm, ..., pages, NULL);
+ * get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* with:
*
- * get_user_pages_unlocked(tsk, mm, ..., pages);
+ * get_user_pages_unlocked(mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
* get_user_pages_fast should be used instead if specific gup_flags
@@ -2079,7 +2051,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
mmap_read_lock(mm);
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
if (locked)
mmap_read_unlock(mm);
@@ -2724,7 +2696,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
*/
if (gup_flags & FOLL_LONGTERM) {
mmap_read_lock(current->mm);
- ret = __gup_longterm_locked(current, current->mm,
+ ret = __gup_longterm_locked(current->mm,
start, nr_pages,
pages, NULL, gup_flags);
mmap_read_unlock(current->mm);
@@ -2967,10 +2939,8 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
/**
- * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ * pin_user_pages_remote() - pin pages of a remote process
*
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -2991,7 +2961,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
*/
-long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -3001,7 +2971,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -3033,7 +3003,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3078,7 +3048,7 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
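With the task_struct argument gone from the gup family, callers pass only the target mm; the per-task maj_flt/min_flt bookkeeping that used to live here now happens in handle_mm_fault() (see the mm_account_fault() hunk in mm/memory.c below). A hypothetical caller, shown only to illustrate the new calling convention:

#include <linux/mm.h>

/* Illustrative only: pins one writable page at uaddr in the given mm. */
static long example_pin_one(struct mm_struct *mm, unsigned long uaddr,
			    struct page **page)
{
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	/* Old form was get_user_pages_remote(tsk, mm, ...); the tsk argument is gone. */
	ret = get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, page, NULL, &locked);
	if (locked)
		mmap_read_unlock(mm);	/* if GUP dropped the lock, locked is 0 */
	return ret;
}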
diff --git a/mm/hmm.c b/mm/hmm.c
index 0809baee49d0..2f9c90a6b3b3 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -75,7 +75,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}
for (; addr < end; addr += PAGE_SIZE)
- if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+ VM_FAULT_ERROR)
return -EFAULT;
return -EBUSY;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0da89305a555..589c330df4db 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1092,15 +1092,6 @@ retry_cpuset:
return NULL;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -2022,31 +2013,9 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
}
/* page migration callback function */
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
-{
- gfp_t gfp_mask = htlb_alloc_mask(h);
- struct page *page = NULL;
-
- if (nid != NUMA_NO_NODE)
- gfp_mask |= __GFP_THISNODE;
-
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
- spin_unlock(&hugetlb_lock);
-
- if (!page)
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
-
- return page;
-}
-
-/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask)
+ nodemask_t *nmask, gfp_t gfp_mask)
{
- gfp_t gfp_mask = htlb_alloc_mask(h);
-
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
@@ -2074,7 +2043,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
diff --git a/mm/internal.h b/mm/internal.h
index dd14c5311d7f..10c677655912 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -613,5 +613,11 @@ static inline bool is_migrate_highatomic_page(struct page *page)
}
void setup_zone_pageset(struct zone *zone);
-extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+
+struct migration_target_control {
+ int nid; /* preferred node id */
+ nodemask_t *nmask;
+ gfp_t gfp_mask;
+};
+
#endif /* __MM_INTERNAL_H */
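migration_target_control is the new per-call argument block for migrate_pages(): instead of each subsystem supplying its own allocation callback, callers fill in a preferred node, an optional nodemask and a gfp mask, and pass the struct through the private argument to the common alloc_migration_target() (added in mm/migrate.c below). A sketch of that calling convention, mirroring the callers converted later in this series:

#include <linux/gfp.h>
#include <linux/migrate.h>
#include "internal.h"		/* struct migration_target_control (mm-internal) */

/* Illustrative wrapper, not a function from this patch. */
static int example_migrate_list(struct list_head *pagelist, int nid)
{
	struct migration_target_control mtc = {
		.nid = nid,		/* NUMA_NO_NODE lets the callback use the source node */
		.nmask = NULL,		/* no nodemask restriction */
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	return migrate_pages(pagelist, alloc_migration_target, NULL,
			     (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
}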
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 29a684a4656a..4e644dc074b1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1169,8 +1169,10 @@ static bool update_checksum(struct kmemleak_object *object)
u32 old_csum = object->checksum;
kasan_disable_current();
+ kcsan_disable_current();
object->checksum = crc32(0, (void *)object->pointer, object->size);
kasan_enable_current();
+ kcsan_enable_current();
return object->checksum != old_csum;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 5fb176d497ea..90a625b02a1d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
else
ret = VM_FAULT_WRITE;
put_page(page);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index e825804b3928..5aa6e44bc2ae 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -180,7 +180,7 @@ unsigned long list_lru_count_one(struct list_lru *lru,
rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
- count = l->nr_items;
+ count = READ_ONCE(l->nr_items);
rcu_read_unlock();
return count;
diff --git a/mm/madvise.c b/mm/madvise.c
index 1fe2e55c0a9b..7b5ca96108cd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,17 +17,20 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/compat.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>
+#include <linux/uio.h>
#include <asm/tlb.h>
@@ -36,6 +39,7 @@
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
+ struct task_struct *target_task;
};
/*
@@ -255,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@@ -289,12 +294,12 @@ static long madvise_willneed(struct vm_area_struct *vma,
*/
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return 0;
}
@@ -315,6 +320,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (fatal_signal_pending(current))
return -EINTR;
+ if (private->target_task &&
+ fatal_signal_pending(private->target_task))
+ return -EINTR;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
@@ -476,12 +485,14 @@ static const struct mm_walk_ops cold_walk_ops = {
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct task_struct *task,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
+ .target_task = task,
};
tlb_start_vma(tlb, vma);
@@ -489,7 +500,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
-static long madvise_cold(struct vm_area_struct *vma,
+static long madvise_cold(struct task_struct *task,
+ struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
{
@@ -502,19 +514,21 @@ static long madvise_cold(struct vm_area_struct *vma,
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, task, vma, start_addr, end_addr);
tlb_finish_mmu(&tlb, start_addr, end_addr);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct task_struct *task,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct madvise_walk_private walk_private = {
.pageout = true,
.tlb = tlb,
+ .target_task = task,
};
tlb_start_vma(tlb, vma);
@@ -538,7 +552,8 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}
-static long madvise_pageout(struct vm_area_struct *vma,
+static long madvise_pageout(struct task_struct *task,
+ struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
{
@@ -554,7 +569,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, task, vma, start_addr, end_addr);
tlb_finish_mmu(&tlb, start_addr, end_addr);
return 0;
@@ -683,7 +698,6 @@ out:
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
-
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
arch_leave_lazy_mmu_mode();
@@ -763,6 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int behavior)
{
+ struct mm_struct *mm = vma->vm_mm;
+
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -770,8 +786,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
- mmap_read_lock(current->mm);
- vma = find_vma(current->mm, start);
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
@@ -825,6 +841,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@@ -852,13 +869,13 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return error;
}
@@ -927,7 +944,8 @@ static int madvise_inject_error(int behavior,
#endif
static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+madvise_vma(struct task_struct *task, struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
switch (behavior) {
@@ -936,9 +954,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
+ return madvise_cold(task, vma, prev, start, end);
case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
+ return madvise_pageout(task, vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -985,6 +1003,18 @@ madvise_behavior_valid(int behavior)
}
}
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* The madvise(2) system call.
*
@@ -1032,6 +1062,11 @@ madvise_behavior_valid(int behavior)
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily if memory pressure happens.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
*
* return values:
* zero - success
@@ -1046,7 +1081,8 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct task_struct *target_task, struct mm_struct *mm,
+ unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -1084,7 +1120,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (mmap_write_lock_killable(current->mm))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
/*
@@ -1099,12 +1135,12 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* but for now we have the mmget_still_valid()
* model.
*/
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
+ if (!mmget_still_valid(mm)) {
+ mmap_write_unlock(mm);
return -EINTR;
}
} else {
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
}
/*
@@ -1112,7 +1148,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
@@ -1137,7 +1173,8 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = madvise_vma(vma, &prev, start, tmp, behavior);
+ error = madvise_vma(target_task, vma, &prev,
+ start, tmp, behavior);
if (error)
goto out;
start = tmp;
@@ -1149,19 +1186,122 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
- vma = find_vma(current->mm, start);
+ vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
else
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(start, len_in, behavior);
+ return do_madvise(current, current->mm, start, len_in, behavior);
+}
+
+static int process_madvise_vec(struct task_struct *target_task,
+ struct mm_struct *mm, struct iov_iter *iter, int behavior)
+{
+ struct iovec iovec;
+ int ret = 0;
+
+ while (iov_iter_count(iter)) {
+ iovec = iov_iter_iovec(iter);
+ ret = do_madvise(target_task, mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(iter, iovec.iov_len);
+ }
+
+ return ret;
+}
+
+static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter,
+ int behavior, unsigned int flags)
+{
+ ssize_t ret;
+ struct pid *pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len = iov_iter_count(iter);
+
+ if (flags != 0)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(pidfd);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ if (task->mm != current->mm &&
+ !process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ ret = process_madvise_vec(task, mm, iter, behavior);
+ if (ret >= 0)
+ ret = total_len - iov_iter_count(iter);
+
+ mmput(mm);
+release_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+ return ret;
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ unsigned long, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret >= 0) {
+ ret = do_process_madvise(pidfd, &iter, behavior, flags);
+ kfree(iov);
+ }
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE5(process_madvise, compat_int_t, pidfd,
+ const struct compat_iovec __user *, vec,
+ compat_ulong_t, vlen,
+ compat_int_t, behavior,
+ compat_uint_t, flags)
+
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+
+ ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret >= 0) {
+ ret = do_process_madvise(pidfd, &iter, behavior, flags);
+ kfree(iov);
+ }
+ return ret;
}
+#endif
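process_madvise(2) takes a pidfd, an iovec array describing ranges in the target's address space, a behavior and a flags word (which must be zero); for a foreign mm only MADV_COLD and MADV_PAGEOUT are accepted. A hedged userspace sketch, assuming the architecture has wired the syscall up and that __NR_process_madvise and MADV_COLD are visible through the installed headers:

#include <sys/mman.h>		/* MADV_COLD */
#include <sys/syscall.h>	/* __NR_process_madvise (assumed to be defined) */
#include <sys/uio.h>
#include <unistd.h>

/* Hint that [addr, addr + len) in the pidfd's process is cold. */
static long hint_remote_cold(int pidfd, void *addr, size_t len)
{
	struct iovec iov = { .iov_base = addr, .iov_len = len };

	/* flags must be 0; the kernel walks the iovec and calls do_madvise() per range. */
	return syscall(__NR_process_madvise, pidfd, &iov, 1UL, MADV_COLD, 0U);
}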
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7359164c3fe9..9523515be997 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1659,13 +1659,6 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private)
-{
- int nid = page_to_nid(p);
-
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
-}
-
/*
* Safely get reference count of an arbitrary page.
* Returns 0 for a free page, -EIO for a zero refcount page
@@ -1770,6 +1763,10 @@ static int __soft_offline_page(struct page *page)
const char *msg_page[] = {"page", "hugepage"};
bool huge = PageHuge(page);
LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1806,8 +1803,8 @@ static int __soft_offline_page(struct page *page)
}
if (isolate_page(hpage, &pagelist)) {
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (!ret) {
page_handle_poison(page, true, true, huge);
} else {
diff --git a/mm/memory.c b/mm/memory.c
index 9ffbe8fa08e6..45e1dc0547ba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -71,6 +71,8 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <trace/events/kmem.h>
@@ -3127,8 +3129,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!page) {
struct swap_info_struct *si = swp_swap_info(entry);
- if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(entry) == 1) {
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -4366,6 +4368,67 @@ retry_pud:
return handle_pte_fault(&vmf);
}
+/**
+ * mm_account_fault - Do page fault accounting
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
+ * of perf event counters, but we'll still do the per-task accounting to
+ * the task that triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accounting. Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ if (major)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+
+ /*
+ * If the fault is done for GUP, regs will be NULL. We only update
+ * the per-thread fault counters of the task that triggered the
+ * fault, and we skip the perf event updates.
+ */
+ if (!regs)
+ return;
+
+ if (major)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ else
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -4373,7 +4436,7 @@ retry_pud:
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
@@ -4414,6 +4477,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
+ mm_account_fault(regs, address, flags, ret);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -4687,7 +4752,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages_remote(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
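Since fault accounting now lives in handle_mm_fault(), architecture fault handlers pass their pt_regs down so the core code can bump current->maj_flt/min_flt and emit the PERF_COUNT_SW_PAGE_FAULTS_{MAJ,MIN} events in one place; GUP and other kernel-internal faulters pass NULL and only get the per-task counters. A hedged sketch of an arch call site after this change (retry and error handling elided):

#include <linux/mm.h>
#include <linux/ptrace.h>

/* Illustrative fragment of an arch page-fault handler, not real arch code. */
static vm_fault_t example_arch_fault(struct pt_regs *regs,
				     struct vm_area_struct *vma,
				     unsigned long address, unsigned int flags)
{
	/*
	 * Passing regs lets mm_account_fault() do both the per-task counters
	 * and the perf software events; passing NULL would skip the latter.
	 */
	return handle_mm_fault(vma, address, flags, regs);
}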
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0204de3f9601..dcdf3271f87e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1275,23 +1275,6 @@ found:
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- int nid = page_to_nid(page);
- nodemask_t nmask = node_states[N_MEMORY];
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(nid, nmask);
- if (nodes_empty(nmask))
- node_set(nid, nmask);
-
- return new_page_nodemask(page, nid, &nmask);
-}
-
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1351,9 +1334,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * We have checked that the migration range is on a single zone so
+ * we can use the nid of the first page for all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dabcee8630f2..93fcfc1f2fa2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1065,27 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
return 0;
}
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE, 0);
-}
-
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -1096,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
nodes_clear(nmask);
node_set(source, nmask);
@@ -1110,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 85efab3da720..79bff63ecf27 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -489,7 +489,7 @@ void mempool_free(void *element, mempool_t *pool)
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
- if (unlikely(pool->curr_nr < pool->min_nr)) {
+ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
diff --git a/mm/migrate.c b/mm/migrate.c
index de0f3662c605..c23378169762 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1533,6 +1533,49 @@ out:
return rc;
}
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
+}
+
#ifdef CONFIG_NUMA
static int store_status(int __user *status, int start, int value, int nr)
@@ -1550,9 +1593,13 @@ static int do_move_pages_to_node(struct mm_struct *mm,
struct list_head *pagelist, int node)
{
int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
return err;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6f4056cf7384..b31f310c2545 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8349,6 +8349,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
+ struct migration_target_control mtc = {
+ .nid = zone_to_nid(cc->zone),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
migrate_prep();
@@ -8375,8 +8379,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c56db2d5e159..e3a205275a0b 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -77,8 +77,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
*/
- if (new > c->watermark)
- c->watermark = new;
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
}
}
@@ -119,9 +119,10 @@ bool page_counter_try_charge(struct page_counter *counter,
propagate_protected_usage(counter, new);
/*
* This is racy, but we can live with some
- * inaccuracy in the failcnt.
+ * inaccuracy in the failcnt which is only used
+ * to report stats.
*/
- c->failcnt++;
+ data_race(c->failcnt++);
*fail = c;
goto failed;
}
@@ -130,8 +131,8 @@ bool page_counter_try_charge(struct page_counter *counter,
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
*/
- if (new > c->watermark)
- c->watermark = new;
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
}
return true;
diff --git a/mm/page_io.c b/mm/page_io.c
index eaaecb75a8c5..739cb5b458dc 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -85,7 +85,7 @@ static void swap_slot_free_notify(struct page *page)
return;
sis = page_swap_info(page);
- if (!(sis->flags & SWP_BLKDEV))
+ if (data_race(!(sis->flags & SWP_BLKDEV)))
return;
/*
@@ -312,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (sis->flags & SWP_FS) {
+ if (data_race(sis->flags & SWP_FS)) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (sis->flags & SWP_FS) {
+ if (data_race(sis->flags & SWP_FS)) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -465,7 +465,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (sis->flags & SWP_FS) {
+ if (data_race(sis->flags & SWP_FS)) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f6d07c5f0d34..242c03121d73 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -306,8 +306,3 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
return pfn < end_pfn ? -EBUSY : 0;
}
-
-struct page *alloc_migrate_target(struct page *page, unsigned long private)
-{
- return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
-}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index cc85ce81914a..29c052099aff 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -105,7 +105,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
mmap_read_lock(mm);
- pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+ pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
diff --git a/mm/rmap.c b/mm/rmap.c
index c56fab5826c1..b970353c06ed 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -672,7 +672,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
- if (mm->tlb_flush_batched) {
+ if (data_race(mm->tlb_flush_batched)) {
flush_tlb_mm(mm);
/*
diff --git a/mm/swap.c b/mm/swap.c
index 28ab40041b47..587be74a68de 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -631,7 +631,8 @@ void lru_add_drain_cpu(int cpu)
__pagevec_lru_add(pvec);
pvec = &per_cpu(lru_rotate.pvec, cpu);
- if (pagevec_count(pvec)) {
+ /* Disabling interrupts below acts as a compiler barrier. */
+ if (data_race(pagevec_count(pvec))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
@@ -792,7 +793,7 @@ void lru_add_drain_all(void)
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
- pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) ||
+ data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1983be226b1c..66e750f361ed 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -57,8 +57,8 @@ static bool enable_vma_readahead __read_mostly = true;
#define GET_SWAP_RA_VAL(vma) \
(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
-#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
-#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
+#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
+#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
static struct {
unsigned long add_total;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7d22347264e6..7b0974f0fcc4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -672,7 +672,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
if (offset == si->lowest_bit)
si->lowest_bit += nr_entries;
if (end == si->highest_bit)
- si->highest_bit -= nr_entries;
+ WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
si->inuse_pages += nr_entries;
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
@@ -704,7 +704,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
if (end > si->highest_bit) {
bool was_full = !si->highest_bit;
- si->highest_bit = end;
+ WRITE_ONCE(si->highest_bit, end);
if (was_full && (si->flags & SWP_WRITEOK))
add_to_avail_list(si);
}
@@ -869,7 +869,7 @@ checks:
else
goto done;
}
- si->swap_map[offset] = usage;
+ WRITE_ONCE(si->swap_map[offset], usage);
inc_cluster_info_page(si, si->cluster_info, offset);
unlock_cluster(ci);
@@ -928,12 +928,13 @@ done:
scan:
spin_unlock(&si->lock);
- while (++offset <= si->highest_bit) {
- if (!si->swap_map[offset]) {
+ while (++offset <= READ_ONCE(si->highest_bit)) {
+ if (data_race(!si->swap_map[offset])) {
spin_lock(&si->lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full() &&
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
spin_lock(&si->lock);
goto checks;
}
@@ -945,11 +946,12 @@ scan:
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (!si->swap_map[offset]) {
+ if (data_race(!si->swap_map[offset])) {
spin_lock(&si->lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full() &&
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
spin_lock(&si->lock);
goto checks;
}
@@ -1148,7 +1150,7 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
p = swp_swap_info(entry);
if (!p)
goto bad_nofile;
- if (!(p->flags & SWP_USED))
+ if (data_race(!(p->flags & SWP_USED)))
goto bad_device;
offset = swp_offset(entry);
if (offset >= p->max)
@@ -1174,7 +1176,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
p = __swap_info_get(entry);
if (!p)
goto out;
- if (!p->swap_map[swp_offset(entry)])
+ if (data_race(!p->swap_map[swp_offset(entry)]))
goto bad_free;
return p;
@@ -1243,7 +1245,10 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
}
usage = count | has_cache;
- p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+ if (usage)
+ WRITE_ONCE(p->swap_map[offset], usage);
+ else
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
return usage;
}
@@ -1295,7 +1300,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
goto bad_nofile;
rcu_read_lock();
- if (!(si->flags & SWP_VALID))
+ if (data_race(!(si->flags & SWP_VALID)))
goto unlock_out;
offset = swp_offset(entry);
if (offset >= si->max)
@@ -3484,7 +3489,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
} else
err = -ENOENT; /* unused swap entry */
- p->swap_map[offset] = count | has_cache;
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
unlock_out:
unlock_cluster_or_swap_info(p, ci);