diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2022-06-28 16:54:05 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2022-06-28 16:54:05 +1000 |
commit | 5f9df76887bf8170e8844f1907c13fbbb30e9c36 (patch) | |
tree | 3b940f7e1c36605af69ffe3a3e9bc8fa53bd12a1 /mm/huge_memory.c | |
parent | 8f850ad6cf932b8b5125f9585cb397e08403cccd (diff) | |
parent | 84b494dcbcc015419e629ced664d3b737e83336e (diff) | |
download | linux-next-akpm-base.tar.gz |
# Conflicts:
# include/linux/pagevec.h
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 247 |
1 files changed, 205 insertions, 42 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9b90a8d7dfa..7b7eedc6d5dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -69,21 +69,85 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_active(struct vm_area_struct *vma) +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps, bool in_pf) { - /* The addr is used to check if the vma size fits */ - unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; + if (!vma->vm_mm) /* vdso */ + return false; + + /* + * Explicitly disabled through madvise or prctl, or some + * architectures may disable THP for some mappings, for + * example, s390 kvm. + * */ + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) + return false; - if (!transhuge_vma_suitable(vma, addr)) + /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ + if (vma_is_dax(vma)) + return in_pf; + + /* + * Special VMA and hugetlb VMA. + * Must be checked after dax since some dax mappings may have + * VM_MIXEDMAP set. + */ + if (vm_flags & VM_NO_KHUGEPAGED) + return false; + + /* + * Check alignment for file vma and size for both file and anon vma. + * + * Skip the check for page fault. Huge fault does the check in fault + * handlers. And this check is not suitable for huge PUD fault. + */ + if (!in_pf && + !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; - if (vma_is_anonymous(vma)) - return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma)) + + /* + * Enabled via shmem mount options or sysfs settings. + * Must be done before hugepage flags check since shmem has its + * own flags. + */ + if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma)) + + if (!hugepage_flags_enabled()) + return false; + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + return false; + + /* Only regular file is valid */ + if (!in_pf && file_thp_enabled(vma)) return true; - return false; + if (!vma_is_anonymous(vma)) + return false; + + if (vma_is_temporary_stack(vma)) + return false; + + /* + * THPeligible bit of smaps should show 1 for proper VMAs even + * though anon_vma is not initialized yet. + * + * Allow page fault since anon_vma may be not initialized until + * the first page fault. + */ + if (!vma->anon_vma) + return (smaps || in_pf); + + return true; } static bool get_huge_zero_page(void) @@ -423,10 +487,10 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); if (err) goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker); + err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); if (err) goto err_split_shrinker; @@ -497,25 +561,125 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) } #ifdef CONFIG_MEMCG -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static struct shrinker deferred_split_shrinker; + +static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct mem_cgroup *memcg = page_memcg(compound_head(page)); - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + if (mem_cgroup_disabled()) + return NULL; + if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue) + return NULL; + return container_of(queue, struct mem_cgroup, deferred_split_queue); +} - if (memcg) - return &memcg->deferred_split_queue; - else - return &pgdat->deferred_split_queue; +static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + return memcg ? &memcg->deferred_split_queue : NULL; +} + +static void thp_sq_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + spin_lock(&src->deferred_split_queue.split_queue_lock); + spin_lock_nested(&dst->deferred_split_queue.split_queue_lock, + SINGLE_DEPTH_NESTING); } + +static void thp_sq_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + int nid; + struct deferred_split *src_queue, *dst_queue; + + src_queue = &src->deferred_split_queue; + dst_queue = &dst->deferred_split_queue; + + if (!src_queue->split_queue_len) + return; + + list_splice_tail_init(&src_queue->split_queue, &dst_queue->split_queue); + dst_queue->split_queue_len += src_queue->split_queue_len; + src_queue->split_queue_len = 0; + + for_each_node(nid) + set_shrinker_bit(dst, nid, deferred_split_shrinker.id); +} + +static void thp_sq_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst) +{ + spin_unlock(&dst->deferred_split_queue.split_queue_lock); + spin_unlock(&src->deferred_split_queue.split_queue_lock); +} +DEFINE_MEMCG_REPARENT_OPS(thp_sq); #else -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return NULL; +} - return &pgdat->deferred_split_queue; +static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio) +{ + return NULL; } #endif +static struct deferred_split *folio_split_queue(struct folio *folio) +{ + struct deferred_split *queue = folio_memcg_split_queue(folio); + + return queue ? : &NODE_DATA(folio_nid(folio))->deferred_split_queue; +} + +static struct deferred_split *folio_split_queue_lock(struct folio *folio) +{ + struct deferred_split *queue; + + rcu_read_lock(); +retry: + queue = folio_split_queue(folio); + spin_lock(&queue->split_queue_lock); + + if (unlikely(folio_split_queue_memcg(folio, queue) != folio_memcg(folio))) { + spin_unlock(&queue->split_queue_lock); + goto retry; + } + rcu_read_unlock(); + + return queue; +} + +static struct deferred_split * +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) +{ + struct deferred_split *queue; + + rcu_read_lock(); +retry: + queue = folio_split_queue(folio); + spin_lock_irqsave(&queue->split_queue_lock, *flags); + + if (unlikely(folio_split_queue_memcg(folio, queue) != folio_memcg(folio))) { + spin_unlock_irqrestore(&queue->split_queue_lock, *flags); + goto retry; + } + rcu_read_unlock(); + + return queue; +} + +static inline void split_queue_unlock(struct deferred_split *queue) +{ + spin_unlock(&queue->split_queue_lock); +} + +static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, + unsigned long flags) +{ + spin_unlock_irqrestore(&queue->split_queue_lock, flags); +} + void prep_transhuge_page(struct page *page) { /* @@ -726,7 +890,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - khugepaged_enter(vma, vma->vm_flags); + khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && @@ -2266,11 +2430,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); @@ -2455,7 +2619,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); - unlock_page_lruvec(lruvec); + lruvec_unlock(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); @@ -2540,7 +2704,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); + struct deferred_split *ds_queue; XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -2632,13 +2796,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&ds_queue->split_queue_lock); + ds_queue = folio_split_queue_lock(folio); if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); if (mapping) { int nr = thp_nr_pages(head); @@ -2656,7 +2820,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __split_huge_page(page, list, end); ret = 0; } else { - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); fail: if (mapping) xas_unlock(&xas); @@ -2680,25 +2844,23 @@ out: void free_transhuge_page(struct page *page) { - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue; unsigned long flags; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(page_folio(page), &flags); if (!list_empty(page_deferred_list(page))) { ds_queue->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { - struct deferred_split *ds_queue = get_deferred_split_queue(page); -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = page_memcg(compound_head(page)); -#endif + struct deferred_split *ds_queue; unsigned long flags; + struct folio *folio = page_folio(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -2715,18 +2877,19 @@ void deferred_split_huge_page(struct page *page) if (PageSwapCache(page)) return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (list_empty(page_deferred_list(page))) { + struct mem_cgroup *memcg; + + memcg = folio_split_queue_memcg(folio, ds_queue); count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &ds_queue->split_queue); ds_queue->split_queue_len++; -#ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); -#endif + shrinker_id(&deferred_split_shrinker)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, @@ -2906,7 +3069,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, } /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU); if (IS_ERR(page)) continue; |