diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 205 |
1 files changed, 117 insertions, 88 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f42bb51e023a..1cc4a5f4791e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -36,6 +36,7 @@ #include <linux/numa.h> #include <linux/page_owner.h> #include <linux/sched/sysctl.h> +#include <linux/memory-tiers.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -70,9 +71,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; @@ -119,13 +119,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); + return shmem_huge_enabled(vma, !enforce_sysfs); - if (!hugepage_flags_enabled()) - return false; - - /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) return false; /* Only regular file is valid */ @@ -164,7 +163,6 @@ retry: count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } - count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); @@ -176,6 +174,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); + count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } @@ -772,8 +771,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - if (pgtable) - pgtable_trans_huge_deposit(mm, pmd, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } @@ -1307,6 +1305,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; @@ -1328,46 +1327,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } page = pmd_page(orig_pmd); + folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; - if (!trylock_page(page)) { - get_page(page); + if (!folio_trylock(folio)) { + folio_get(folio); spin_unlock(vmf->ptl); - lock_page(page); + folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } - put_page(page); + folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { - unlock_page(page); + folio_unlock(folio); goto reuse; } /* - * See do_wp_page(): we can only reuse the page exclusively if there are - * no additional references. Note that we always drain the LRU - * pagevecs immediately after adding a THP. + * See do_wp_page(): we can only reuse the folio exclusively if + * there are no additional references. Note that we always drain + * the LRU pagevecs immediately after adding a THP. */ - if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + if (folio_ref_count(folio) > + 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (page_count(page) == 1) { + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_ref_count(folio) == 1) { pmd_t entry; page_move_anon_rmap(page, vma); - unlock_page(page); + folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); @@ -1382,7 +1383,7 @@ reuse: } unlock_fallback: - unlock_page(page); + folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); @@ -1449,7 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) @@ -1479,7 +1480,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; - int target_nid, last_cpupid = -1; + int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); bool migrated = false; bool was_writable = pmd_savedwrite(oldpmd); int flags = 0; @@ -1500,7 +1501,12 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); - last_cpupid = page_cpupid_last(page); + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (node_is_toptier(page_nid)) + last_cpupid = page_cpupid_last(page); target_nid = numa_migrate_prep(page, vma, haddr, page_nid, &flags); @@ -1824,6 +1830,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (prot_numa) { struct page *page; + bool toptier; /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1836,13 +1843,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto unlock; page = pmd_page(*pmd); + toptier = node_is_toptier(page_to_nid(page)); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(page_to_nid(page))) + toptier) goto unlock; + + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical @@ -2029,7 +2041,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pgtable_t pgtable; pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; - bool anon_exclusive = false; + bool anon_exclusive = false, dirty = false; unsigned long addr; int i; @@ -2113,13 +2125,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = is_writable_migration_entry(entry); if (PageAnon(page)) anon_exclusive = is_readable_exclusive_migration_entry(entry); - young = false; + young = is_migration_entry_young(entry); + dirty = is_migration_entry_dirty(entry); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); - if (pmd_dirty(old_pmd)) + if (pmd_dirty(old_pmd)) { + dirty = true; SetPageDirty(page); + } write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); @@ -2140,6 +2155,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. + * + * See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) @@ -2171,6 +2188,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, else swp_entry = make_readable_migration_entry( page_to_pfn(page + i)); + if (young) + swp_entry = make_migration_entry_young(swp_entry); + if (dirty) + swp_entry = make_migration_entry_dirty(swp_entry); entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); @@ -2185,6 +2206,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) @@ -2288,25 +2312,11 @@ out: void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) + if (!pmd) return; - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) - return; - - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, folio); } @@ -2334,24 +2344,23 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } } -static void unmap_page(struct page *page) +static void unmap_folio(struct folio *folio) { - struct folio *folio = page_folio(page); enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* * Anon pages need migration entries to preserve them, but file @@ -2368,7 +2377,7 @@ static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; - /* If unmap_page() uses try_to_migrate() on file, remove this check */ + /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { @@ -2418,7 +2427,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, - * PG_anon_exclusive has been cleared in unmap_page() and is stored in + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop @@ -2438,7 +2447,8 @@ static void __split_huge_page_tail(struct page *head, int tail, #ifdef CONFIG_64BIT (1L << PG_arch_2) | #endif - (1L << PG_dirty))); + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first tail page is compound_mapcount */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, @@ -2611,27 +2621,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); - XA_STATE(xas, &head->mapping->i_pages, head->index); + struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_PAGE(!PageLocked(head), head); - VM_BUG_ON_PAGE(!PageCompound(head), head); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - is_hzp = is_huge_zero_page(head); - VM_WARN_ON_ONCE_PAGE(is_hzp, head); + is_hzp = is_huge_zero_page(&folio->page); + VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); if (is_hzp) return -EBUSY; - if (PageWriteback(head)) + if (folio_test_writeback(folio)) return -EBUSY; - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -2640,7 +2649,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(head); + anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; @@ -2649,7 +2658,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) mapping = NULL; anon_vma_lock_write(anon_vma); } else { - mapping = head->mapping; + gfp_t gfp; + + mapping = folio->mapping; /* Truncated ? */ if (!mapping) { @@ -2657,8 +2668,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), - mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + + if (folio_test_private(folio) && + !filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } + + xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -2672,7 +2691,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but - * head page lock is good enough to serialize the trimming. + * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) @@ -2680,7 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* - * Racy check if we can split the page, before unmap_page() will + * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { @@ -2688,38 +2707,38 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(head); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* - * Check if the head page is present in page cache. - * We assume all tail are present too, if head is there. + * Check if the folio is present in page cache. + * We assume all tail are present too, if folio is there. */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != head) + if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - if (page_ref_freeze(head, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(head))) { + if (folio_ref_freeze(folio, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(&folio->page))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(head)); + list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { - int nr = thp_nr_pages(head); + int nr = folio_nr_pages(folio); - xas_split(&xas, head, thp_order(head)); - if (PageSwapBacked(head)) { - __mod_lruvec_page_state(head, NR_SHMEM_THPS, + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __mod_lruvec_page_state(head, NR_FILE_THPS, + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } @@ -2983,7 +3002,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, /* FOLL_DUMP to ignore special (like zero) pages */ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) continue; if (!is_transparent_hugepage(page)) @@ -3175,6 +3194,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + /* See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); @@ -3189,6 +3209,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); else entry = make_readable_migration_entry(page_to_pfn(page)); + if (pmd_young(pmdval)) + entry = make_migration_entry_young(entry); + if (pmd_dirty(pmdval)) + entry = make_migration_entry_dirty(entry); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); @@ -3214,13 +3238,18 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); get_page(new); - pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot))); + pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); + if (!is_migration_entry_young(entry)) + pmde = pmd_mkold(pmde); + /* NOTE: this may contain setting soft-dirty on some archs */ + if (PageDirty(new) && is_migration_entry_dirty(entry)) + pmde = pmd_mkdirty(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; |