summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/gup.c7
-rw-r--r--mm/hugetlb.c24
-rw-r--r--mm/kasan/kasan.h12
-rw-r--r--mm/kfence/core.c19
-rw-r--r--mm/kfence/kfence_test.c2
-rw-r--r--mm/kmemleak.c6
-rw-r--r--mm/madvise.c4
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memcontrol.c9
-rw-r--r--mm/memory-failure.c12
-rw-r--r--mm/memory.c11
-rw-r--r--mm/migrate.c50
-rw-r--r--mm/mmap_lock.c4
-rw-r--r--mm/page_alloc.c82
-rw-r--r--mm/rmap.c39
-rw-r--r--mm/secretmem.c1
-rw-r--r--mm/shmem.c14
-rw-r--r--mm/slab.h17
-rw-r--r--mm/slub.c132
-rw-r--r--mm/swap_state.c7
-rw-r--r--mm/util.c10
-rw-r--r--mm/vmscan.c30
23 files changed, 257 insertions, 240 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 271f2ca862c8..f5561ea7d90a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -398,12 +398,12 @@ static void cgwb_release_workfn(struct work_struct *work)
blkcg_unpin_online(blkcg);
fprop_local_destroy_percpu(&wb->memcg_completions);
- percpu_ref_exit(&wb->refcnt);
spin_lock_irq(&cgwb_lock);
list_del(&wb->offline_node);
spin_unlock_irq(&cgwb_lock);
+ percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
diff --git a/mm/gup.c b/mm/gup.c
index 42b8b1fa6521..b94717977d17 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1558,9 +1558,12 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
gup_flags |= FOLL_WRITE;
/*
- * See check_vma_flags(): Will return -EFAULT on incompatible mappings
- * or with insufficient permissions.
+ * We want to report -EINVAL instead of -EFAULT for any permission
+ * problems or incompatible mappings.
*/
+ if (check_vma_flags(vma, gup_flags))
+ return -EINVAL;
+
return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 924553aa8f78..8ea35ba6699f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2476,7 +2476,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * added by alloc_huge_page. We know it was added
+ * not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
@@ -4660,7 +4660,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
+ /* No restore in case of successful pagetable update (Break COW) */
+ if (new_page != old_page)
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -4776,7 +4778,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -4799,6 +4801,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto out;
retry:
+ new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
@@ -4842,6 +4845,7 @@ retry:
goto retry;
goto out;
}
+ new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -4926,7 +4930,9 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
+ /* restore reserve for newly allocated pages not in page cache */
+ if (new_page && !new_pagecache_page)
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -5135,6 +5141,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
int ret = -ENOMEM;
struct page *page;
int writable;
+ bool new_pagecache_page = false;
if (is_continue) {
ret = -EFAULT;
@@ -5228,6 +5235,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
+ new_pagecache_page = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5291,7 +5299,8 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!new_pagecache_page)
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
@@ -5440,8 +5449,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
continue;
}
- refs = min3(pages_per_huge_page(h) - pfn_offset,
- (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+ /* vaddr may not be aligned to PAGE_SIZE */
+ refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+ (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
record_subpages_vmas(mem_map_offset(page, pfn_offset),
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 98e3059bfea4..d739cdd1621a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -9,6 +9,7 @@
#ifdef CONFIG_KASAN_HW_TAGS
#include <linux/static_key.h>
+#include "../slab.h"
DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
extern bool kasan_flag_async __ro_after_init;
@@ -387,6 +388,17 @@ static inline void kasan_unpoison(const void *addr, size_t size, bool init)
if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
return;
+ /*
+ * Explicitly initialize the memory with the precise object size to
+ * avoid overwriting the SLAB redzone. This disables initialization in
+ * the arch code and may thus lead to performance penalty. The penalty
+ * is accepted since SLAB redzones aren't enabled in production builds.
+ */
+ if (__slub_debug_enabled() &&
+ init && ((unsigned long)size & KASAN_GRANULE_MASK)) {
+ init = false;
+ memzero_explicit((void *)addr, size);
+ }
size = round_up(size, KASAN_GRANULE_SIZE);
hw_set_mem_tag_range((void *)addr, size, tag, init);
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index d7666ace9d2e..575c685aa642 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -734,6 +734,22 @@ void kfence_shutdown_cache(struct kmem_cache *s)
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
/*
+ * Perform size check before switching kfence_allocation_gate, so that
+ * we don't disable KFENCE without making an allocation.
+ */
+ if (size > PAGE_SIZE)
+ return NULL;
+
+ /*
+ * Skip allocations from non-default zones, including DMA. We cannot
+ * guarantee that pages in the KFENCE pool will have the requested
+ * properties (e.g. reside in DMAable memory).
+ */
+ if ((flags & GFP_ZONEMASK) ||
+ (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+ return NULL;
+
+ /*
* allocation_gate only needs to become non-zero, so it doesn't make
* sense to continue writing to it and pay the associated contention
* cost, in case we have a large number of concurrent allocations.
@@ -757,9 +773,6 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
if (!READ_ONCE(kfence_enabled))
return NULL;
- if (size > PAGE_SIZE)
- return NULL;
-
return kfence_guarded_alloc(s, size, flags);
}
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 7f24b9bcb2ec..942cbc16ad26 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -852,7 +852,7 @@ static void kfence_test_exit(void)
tracepoint_synchronize_unregister();
}
-late_initcall(kfence_test_init);
+late_initcall_sync(kfence_test_init);
module_exit(kfence_test_exit);
MODULE_LICENSE("GPL v2");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 228a2fbe0657..73d46d16d575 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -290,7 +290,7 @@ static void hex_dump_object(struct seq_file *seq,
warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII);
kasan_enable_current();
}
@@ -1171,7 +1171,7 @@ static bool update_checksum(struct kmemleak_object *object)
kasan_disable_current();
kcsan_disable_current();
- object->checksum = crc32(0, (void *)object->pointer, object->size);
+ object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
kasan_enable_current();
kcsan_enable_current();
@@ -1246,7 +1246,7 @@ static void scan_block(void *_start, void *_end,
break;
kasan_disable_current();
- pointer = *ptr;
+ pointer = *(unsigned long *)kasan_reset_tag((void *)ptr);
kasan_enable_current();
untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
diff --git a/mm/madvise.c b/mm/madvise.c
index 6d3d348b17f4..5c065bc8b5f6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -862,10 +862,12 @@ static long madvise_populate(struct vm_area_struct *vma,
switch (pages) {
case -EINTR:
return -EINTR;
- case -EFAULT: /* Incompatible mappings / permissions. */
+ case -EINVAL: /* Incompatible mappings / permissions. */
return -EINVAL;
case -EHWPOISON:
return -EHWPOISON;
+ case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
+ return -EFAULT;
default:
pr_warn_once("%s: unhandled return value: %ld\n",
__func__, pages);
diff --git a/mm/memblock.c b/mm/memblock.c
index 0041ff62c584..de7b553baa50 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -947,7 +947,8 @@ static bool should_skip_region(struct memblock_type *type,
return true;
/* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
+ !(flags & MEMBLOCK_HOTPLUG))
return true;
/* if we want mirror memory skip non-mirror memory regions */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ae1f5d0cb581..702a81dfe72d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3106,13 +3106,15 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
+ struct pglist_data *oldpg = stock->cached_pgdat;
+
if (stock->nr_slab_reclaimable_b) {
- mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -3574,7 +3576,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- cgroup_rstat_flush(memcg->css.cgroup);
+ /* mem_cgroup_threshold() calls here from irqsafe context */
+ cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
val = memcg_page_state(memcg, NR_FILE_PAGES) +
memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index eefd823deb67..470400cc7513 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1146,7 +1146,7 @@ static int __get_hwpoison_page(struct page *page)
* unexpected races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head))
- return 0;
+ return -EBUSY;
if (PageTransHuge(head)) {
/*
@@ -1199,9 +1199,15 @@ try_again:
}
goto out;
} else if (ret == -EBUSY) {
- /* We raced with freeing huge page to buddy, retry. */
- if (pass++ < 3)
+ /*
+ * We raced with (possibly temporary) unhandlable
+ * page, retry.
+ */
+ if (pass++ < 3) {
+ shake_page(p, 1);
goto try_again;
+ }
+ ret = -EIO;
goto out;
}
}
diff --git a/mm/memory.c b/mm/memory.c
index 747a01d495f2..25fc46e87214 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4026,8 +4026,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}
- if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (likely(pmd_none(*vmf->pmd))) {
+ mm_inc_nr_ptes(vma->vm_mm);
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ vmf->prealloc_pte = NULL;
+ }
+ spin_unlock(vmf->ptl);
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
return VM_FAULT_OOM;
+ }
}
/* See comment in handle_pte_fault() */
diff --git a/mm/migrate.c b/mm/migrate.c
index 23cbd9de030b..7e240437e7d9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,54 +537,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
}
/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page. We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
- int nr_pages)
-{
- int i;
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < nr_pages; ) {
- cond_resched();
- copy_highpage(dst, src);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
-void copy_huge_page(struct page *dst, struct page *src)
-{
- int i;
- int nr_pages;
-
- if (PageHuge(src)) {
- /* hugetlbfs page */
- struct hstate *h = page_hstate(src);
- nr_pages = pages_per_huge_page(h);
-
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
- __copy_gigantic_page(dst, src, nr_pages);
- return;
- }
- } else {
- /* thp page */
- BUG_ON(!PageTransHuge(src));
- nr_pages = thp_nr_pages(src);
- }
-
- for (i = 0; i < nr_pages; i++) {
- cond_resched();
- copy_highpage(dst + i, src + i);
- }
-}
-
-/*
* Copy the page to its new location
*/
void migrate_page_states(struct page *newpage, struct page *page)
@@ -2116,7 +2068,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
LIST_HEAD(migratepages);
new_page_t *new;
bool compound;
- unsigned int nr_pages = thp_nr_pages(page);
+ int nr_pages = thp_nr_pages(page);
/*
* PTE mapped THP or HugeTLB page can't reach here so the page could
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f5852a058ce0..1854850b4b89 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -156,14 +156,14 @@ static inline void put_memcg_path_buf(void)
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- preempt_disable(); \
+ local_lock(&memcg_paths.lock); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- preempt_enable(); \
+ local_unlock(&memcg_paths.lock); \
} while (0)
#else /* !CONFIG_MEMCG */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b97e17806be..eeb3a9cb36bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -840,21 +840,24 @@ void init_mem_debugging_and_hardening(void)
}
#endif
- if (_init_on_alloc_enabled_early) {
- if (page_poisoning_requested)
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_alloc\n");
- else
- static_branch_enable(&init_on_alloc);
- }
- if (_init_on_free_enabled_early) {
- if (page_poisoning_requested)
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_free\n");
- else
- static_branch_enable(&init_on_free);
+ if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+ page_poisoning_requested) {
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+ "will take precedence over init_on_alloc and init_on_free\n");
+ _init_on_alloc_enabled_early = false;
+ _init_on_free_enabled_early = false;
}
+ if (_init_on_alloc_enabled_early)
+ static_branch_enable(&init_on_alloc);
+ else
+ static_branch_disable(&init_on_alloc);
+
+ if (_init_on_free_enabled_early)
+ static_branch_enable(&init_on_free);
+ else
+ static_branch_disable(&init_on_free);
+
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -3450,19 +3453,10 @@ void free_unref_page_list(struct list_head *list)
* comment in free_unref_page.
*/
migratetype = get_pcppage_migratetype(page);
- if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
- if (unlikely(is_migrate_isolate(migratetype))) {
- list_del(&page->lru);
- free_one_page(page_zone(page), page, pfn, 0,
- migratetype, FPI_NONE);
- continue;
- }
-
- /*
- * Non-isolated types over MIGRATE_PCPTYPES get added
- * to the MIGRATE_MOVABLE pcp list.
- */
- set_pcppage_migratetype(page, MIGRATE_MOVABLE);
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ list_del(&page->lru);
+ free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+ continue;
}
set_page_private(page, pfn);
@@ -3472,7 +3466,15 @@ void free_unref_page_list(struct list_head *list)
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_private(page);
set_page_private(page, 0);
+
+ /*
+ * Non-isolated types over MIGRATE_PCPTYPES get added
+ * to the MIGRATE_MOVABLE pcp list.
+ */
migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_MOVABLE;
+
trace_mm_page_free_batched(page);
free_unref_page_commit(page, pfn, migratetype, 0);
@@ -3820,7 +3822,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
@@ -5221,9 +5223,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
unsigned int alloc_flags = ALLOC_WMARK_LOW;
int nr_populated = 0, nr_account = 0;
- if (unlikely(nr_pages <= 0))
- return 0;
-
/*
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
@@ -5231,19 +5230,35 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
while (page_array && nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
+ /* No pages requested? */
+ if (unlikely(nr_pages <= 0))
+ goto out;
+
/* Already populated array? */
if (unlikely(page_array && nr_pages - nr_populated == 0))
- return nr_populated;
+ goto out;
/* Use the single page allocator for one page. */
if (nr_pages - nr_populated == 1)
goto failed;
+#ifdef CONFIG_PAGE_OWNER
+ /*
+ * PAGE_OWNER may recurse into the allocator to allocate space to
+ * save the stack with pagesets.lock held. Releasing/reacquiring
+ * removes much of the performance benefit of bulk allocation so
+ * force the caller to allocate one page at a time as it'll have
+ * similar performance to added complexity to the bulk allocator.
+ */
+ if (static_branch_unlikely(&page_owner_inited))
+ goto failed;
+#endif
+
/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
gfp &= gfp_allowed_mask;
alloc_gfp = gfp;
if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
- return 0;
+ goto out;
gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
@@ -5311,6 +5326,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+out:
return nr_populated;
failed_irq:
@@ -5326,7 +5342,7 @@ failed:
nr_populated++;
}
- return nr_populated;
+ goto out;
}
EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
diff --git a/mm/rmap.c b/mm/rmap.c
index 795f9d5f8386..b9eb5c12f3fe 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1440,21 +1440,20 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* If the page is mlock()d, we cannot swap it out.
*/
- if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED) {
- /* PTE-mapped THP are never marked as mlocked */
- if (!PageTransCompound(page) ||
- (PageHead(page) && !PageDoubleMap(page))) {
- /*
- * Holding pte lock, we do *not* need
- * mmap_lock here
- */
- mlock_vma_page(page);
- }
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ if (!(flags & TTU_IGNORE_MLOCK) &&
+ (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * PTE-mapped THP are never marked as mlocked: so do
+ * not set it on a DoubleMap THP, nor on an Anon THP
+ * (which may still be PTE-mapped after DoubleMap was
+ * cleared). But stop unmapping even in those cases.
+ */
+ if (!PageTransCompound(page) || (PageHead(page) &&
+ !PageDoubleMap(page) && !PageAnon(page)))
+ mlock_vma_page(page);
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
}
/* Unexpected PMD-mapped THP? */
@@ -1986,8 +1985,10 @@ static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
*/
if (vma->vm_flags & VM_LOCKED) {
/*
- * PTE-mapped THP are never marked as mlocked, but
- * this function is never called when PageDoubleMap().
+ * PTE-mapped THP are never marked as mlocked; but
+ * this function is never called on a DoubleMap THP,
+ * nor on an Anon THP (which may still be PTE-mapped
+ * after DoubleMap was cleared).
*/
mlock_vma_page(page);
/*
@@ -2022,6 +2023,10 @@ void page_mlock(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+ /* Anon THP are only marked as mlocked when singly mapped */
+ if (PageTransCompound(page) && PageAnon(page))
+ return;
+
rmap_walk(page, &rwc);
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index f77d25467a14..030f02ddc7c1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -152,6 +152,7 @@ static void secretmem_freepage(struct page *page)
}
const struct address_space_operations secretmem_aops = {
+ .set_page_dirty = __set_page_dirty_no_writeback,
.freepage = secretmem_freepage,
.migratepage = secretmem_migratepage,
.isolate_page = secretmem_isolate_page,
diff --git a/mm/shmem.c b/mm/shmem.c
index 70d9ce294bb4..dacda7463d54 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1696,8 +1696,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
- struct swap_info_struct *si;
- struct page *page = NULL;
+ struct page *page;
swp_entry_t swap;
int error;
@@ -1705,12 +1704,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*pagep);
*pagep = NULL;
- /* Prevent swapoff from happening to us. */
- si = get_swap_device(swap);
- if (!si) {
- error = EINVAL;
- goto failed;
- }
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
@@ -1772,8 +1765,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
swap_free(swap);
*pagep = page;
- if (si)
- put_swap_device(si);
return 0;
failed:
if (!shmem_confirm_swap(mapping, index, swap))
@@ -1784,9 +1775,6 @@ unlock:
put_page(page);
}
- if (si)
- put_swap_device(si);
-
return error;
}
diff --git a/mm/slab.h b/mm/slab.h
index 67e06637ff2e..58c01a34e5b8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -216,10 +216,18 @@ DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
extern void print_tracking(struct kmem_cache *s, void *object);
long validate_slab_cache(struct kmem_cache *s);
+static inline bool __slub_debug_enabled(void)
+{
+ return static_branch_unlikely(&slub_debug_enabled);
+}
#else
static inline void print_tracking(struct kmem_cache *s, void *object)
{
}
+static inline bool __slub_debug_enabled(void)
+{
+ return false;
+}
#endif
/*
@@ -229,11 +237,10 @@ static inline void print_tracking(struct kmem_cache *s, void *object)
*/
static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
-#ifdef CONFIG_SLUB_DEBUG
- VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
- if (static_branch_unlikely(&slub_debug_enabled))
+ if (IS_ENABLED(CONFIG_SLUB_DEBUG))
+ VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+ if (__slub_debug_enabled())
return s->flags & flags;
-#endif
return false;
}
@@ -339,7 +346,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
continue;
page = virt_to_head_page(p[i]);
- objcgs = page_objcgs(page);
+ objcgs = page_objcgs_check(page);
if (!objcgs)
continue;
diff --git a/mm/slub.c b/mm/slub.c
index dc863c1ea324..f77d8cd79ef7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -26,7 +26,6 @@
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
-#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
@@ -120,25 +119,11 @@
*/
#ifdef CONFIG_SLUB_DEBUG
-
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
-
-static inline bool __slub_debug_enabled(void)
-{
- return static_branch_unlikely(&slub_debug_enabled);
-}
-
-#else /* CONFIG_SLUB_DEBUG */
-
-static inline bool __slub_debug_enabled(void)
-{
- return false;
-}
-
#endif /* CONFIG_SLUB_DEBUG */
static inline bool kmem_cache_debug(struct kmem_cache *s)
@@ -221,8 +206,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
-#ifdef CONFIG_STACKDEPOT
- depot_stack_handle_t handle;
+#ifdef CONFIG_STACKTRACE
+ unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
@@ -591,8 +576,8 @@ static void print_section(char *level, char *text, u8 *addr,
unsigned int length)
{
metadata_access_enable();
- print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS,
- 16, 1, addr, length, 1);
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
+ 16, 1, kasan_reset_tag((void *)addr), length, 1);
metadata_access_disable();
}
@@ -626,27 +611,22 @@ static struct track *get_track(struct kmem_cache *s, void *object,
return kasan_reset_tag(p + alloc);
}
-#ifdef CONFIG_STACKDEPOT
-static depot_stack_handle_t save_stack_depot_trace(gfp_t flags)
-{
- unsigned long entries[TRACK_ADDRS_COUNT];
- depot_stack_handle_t handle;
- unsigned int nr_entries;
-
- nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 4);
- handle = stack_depot_save(entries, nr_entries, flags);
- return handle;
-}
-#endif
-
static void set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr)
{
struct track *p = get_track(s, object, alloc);
if (addr) {
-#ifdef CONFIG_STACKDEPOT
- p->handle = save_stack_depot_trace(GFP_NOWAIT);
+#ifdef CONFIG_STACKTRACE
+ unsigned int nr_entries;
+
+ metadata_access_enable();
+ nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
+ TRACK_ADDRS_COUNT, 3);
+ metadata_access_disable();
+
+ if (nr_entries < TRACK_ADDRS_COUNT)
+ p->addrs[nr_entries] = 0;
#endif
p->addr = addr;
p->cpu = smp_processor_id();
@@ -673,19 +653,14 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
-#ifdef CONFIG_STACKDEPOT
+#ifdef CONFIG_STACKTRACE
{
- depot_stack_handle_t handle;
- unsigned long *entries;
- unsigned int nr_entries;
-
- handle = READ_ONCE(t->handle);
- if (!handle) {
- pr_err("object allocation/free stack trace missing\n");
- } else {
- nr_entries = stack_depot_fetch(handle, &entries);
- stack_trace_print(entries, nr_entries, 0);
- }
+ int i;
+ for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+ if (t->addrs[i])
+ pr_err("\t%pS\n", (void *)t->addrs[i]);
+ else
+ break;
}
#endif
}
@@ -1425,12 +1400,13 @@ check_slabs:
static int __init setup_slub_debug(char *str)
{
slab_flags_t flags;
+ slab_flags_t global_flags;
char *saved_str;
char *slab_list;
bool global_slub_debug_changed = false;
bool slab_list_specified = false;
- slub_debug = DEBUG_DEFAULT_FLAGS;
+ global_flags = DEBUG_DEFAULT_FLAGS;
if (*str++ != '=' || !*str)
/*
* No options specified. Switch on full debugging.
@@ -1442,7 +1418,7 @@ static int __init setup_slub_debug(char *str)
str = parse_slub_debug_flags(str, &flags, &slab_list, true);
if (!slab_list) {
- slub_debug = flags;
+ global_flags = flags;
global_slub_debug_changed = true;
} else {
slab_list_specified = true;
@@ -1451,16 +1427,18 @@ static int __init setup_slub_debug(char *str)
/*
* For backwards compatibility, a single list of flags with list of
- * slabs means debugging is only enabled for those slabs, so the global
- * slub_debug should be 0. We can extended that to multiple lists as
+ * slabs means debugging is only changed for those slabs, so the global
+ * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
+ * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
* long as there is no option specifying flags without a slab list.
*/
if (slab_list_specified) {
if (!global_slub_debug_changed)
- slub_debug = 0;
+ global_flags = slub_debug;
slub_debug_string = saved_str;
}
out:
+ slub_debug = global_flags;
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
else
@@ -3261,6 +3239,16 @@ struct detached_freelist {
struct kmem_cache *s;
};
+static inline void free_nonslab_page(struct page *page, void *object)
+{
+ unsigned int order = compound_order(page);
+
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+ kfree_hook(object);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
+ __free_pages(page, order);
+}
+
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
@@ -3297,9 +3285,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
if (!s) {
/* Handle kalloc'ed objects */
if (unlikely(!PageSlab(page))) {
- BUG_ON(!PageCompound(page));
- kfree_hook(object);
- __free_pages(page, compound_order(page));
+ free_nonslab_page(page, object);
p[size] = NULL; /* mark object processed */
return size;
}
@@ -4059,26 +4045,18 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
-#ifdef CONFIG_STACKDEPOT
- {
- depot_stack_handle_t handle;
- unsigned long *entries;
- unsigned int nr_entries;
-
- handle = READ_ONCE(trackp->handle);
- if (handle) {
- nr_entries = stack_depot_fetch(handle, &entries);
- for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
- kpp->kp_stack[i] = (void *)entries[i];
- }
+#ifdef CONFIG_STACKTRACE
+ for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+ kpp->kp_stack[i] = (void *)trackp->addrs[i];
+ if (!kpp->kp_stack[i])
+ break;
+ }
- trackp = get_track(s, objp, TRACK_FREE);
- handle = READ_ONCE(trackp->handle);
- if (handle) {
- nr_entries = stack_depot_fetch(handle, &entries);
- for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
- kpp->kp_free_stack[i] = (void *)entries[i];
- }
+ trackp = get_track(s, objp, TRACK_FREE);
+ for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+ kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
+ if (!kpp->kp_free_stack[i])
+ break;
}
#endif
#endif
@@ -4283,13 +4261,7 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
- unsigned int order = compound_order(page);
-
- BUG_ON(!PageCompound(page));
- kfree_hook(object);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(page, order);
+ free_nonslab_page(page, object);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c56aa9ac050d..bc7cee6b2ec5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -628,13 +628,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!mask)
goto skip;
- /* Test swap type to make sure the dereference is safe */
- if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
- struct inode *inode = si->swap_file->f_mapping->host;
- if (inode_read_congested(inode))
- goto skip;
- }
-
do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
diff --git a/mm/util.c b/mm/util.c
index 99c6cc77de9e..9043d03750a7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -731,6 +731,16 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ unsigned i, nr = compound_nr(src);
+
+ for (i = 0; i < nr; i++) {
+ cond_resched();
+ copy_highpage(nth_page(dst, i), nth_page(src, i));
+ }
+}
+
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4620df62f0ff..eeae2f6bc532 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -100,9 +100,12 @@ struct scan_control {
unsigned int may_swap:1;
/*
- * Cgroups are not reclaimed below their configured memory.low,
- * unless we threaten to OOM. If any cgroups are skipped due to
- * memory.low and nothing was reclaimed, go back for memory.low.
+ * Cgroup memory below memory.low is protected as long as we
+ * don't threaten to OOM. If any cgroup is reclaimed at
+ * reduced force or passed over entirely due to its memory.low
+ * setting (memcg_low_skipped), and nothing is reclaimed as a
+ * result, then go back for one more cycle that reclaims the protected
+ * memory (memcg_low_reclaim) to avert OOM.
*/
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
@@ -2537,15 +2540,14 @@ out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
+ unsigned long low, min;
unsigned long scan;
- unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(sc->target_mem_cgroup,
- memcg,
- sc->memcg_low_reclaim);
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg,
+ &min, &low);
- if (protection) {
+ if (min || low) {
/*
* Scale a cgroup's reclaim pressure by proportioning
* its current usage to its memory.low or memory.min
@@ -2576,6 +2578,15 @@ out:
* hard protection.
*/
unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
/* Avoid TOCTOU with earlier protection check */
cgroup_size = max(cgroup_size, protection);
@@ -4413,11 +4424,13 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
};
+ unsigned long pflags;
trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
sc.gfp_mask);
cond_resched();
+ psi_memstall_enter(&pflags);
fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
@@ -4442,6 +4455,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);