diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 282 |
1 files changed, 165 insertions, 117 deletions
diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..be44d0b36b18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, ptep, pte); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); - /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. @@ -1309,22 +1306,34 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct address_space *zap_mapping; /* Check page->mapping if set */ struct folio *single_folio; /* Locked folio to be unmapped */ + bool even_cows; /* Zap COWed private pages too? */ }; -/* - * We set details->zap_mapping when we want to unmap shared but keep private - * pages. Return true if skip zapping this page, false otherwise. - */ -static inline bool -zap_skip_check_mapping(struct zap_details *details, struct page *page) +/* Whether we should zap all COWed (private) pages too */ +static inline bool should_zap_cows(struct zap_details *details) { - if (!details || !page) - return false; + /* By default, zap all pages */ + if (!details) + return true; - return details->zap_mapping && - (details->zap_mapping != page_rmapping(page)); + /* Or, we zap COWed pages only if the caller wants to */ + return details->even_cows; +} + +/* Decides whether we should zap this page with the page pointer specified */ +static inline bool should_zap_page(struct zap_details *details, struct page *page) +{ + /* If we can make a decision without *page.. */ + if (should_zap_cows(details)) + return true; + + /* E.g. the caller passes NULL for the case of a zero page */ + if (!page) + return true; + + /* Otherwise we should only zap non-anon pages */ + return !PageAnon(page); } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1349,6 +1358,8 @@ again: arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; + struct page *page; + if (pte_none(ptent)) continue; @@ -1356,10 +1367,8 @@ again: break; if (pte_present(ptent)) { - struct page *page; - page = vm_normal_page(vma, addr, ptent); - if (unlikely(zap_skip_check_mapping(details, page))) + if (unlikely(!should_zap_page(details, page))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1377,7 +1386,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1391,34 +1400,32 @@ again: entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); - - if (unlikely(zap_skip_check_mapping(details, page))) + page = pfn_swap_entry_to_page(entry); + if (unlikely(!should_zap_page(details, page))) continue; - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); - continue; - } - - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) - continue; - - if (!non_swap_entry(entry)) + } else if (!non_swap_entry(entry)) { + /* Genuine swap entry, hence a private anon page */ + if (!should_zap_cows(details)) + continue; rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; - + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } else if (is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); + if (!should_zap_page(details, page)) + continue; rss[mm_counter(page)]--; + } else if (is_hwpoison_entry(entry)) { + if (!should_zap_cows(details)) + continue; + } else { + /* We should have covered all the swap entry types */ + WARN_ON_ONCE(1); } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1705,7 +1712,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { - if (address < vma->vm_start || address + size > vma->vm_end || + if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & VM_PFNMAP)) return; @@ -1753,16 +1760,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1776,7 +1783,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1785,17 +1791,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1805,7 +1811,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1842,7 +1848,7 @@ more: start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); @@ -3098,7 +3104,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3118,16 +3124,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -3291,19 +3287,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (PageAnon(vmf->page)) { struct page *page = vmf->page; - /* PageKsm() doesn't necessarily raise the page refcount */ - if (PageKsm(page) || page_count(page) != 1) + /* + * We have to verify under page lock: these early checks are + * just an optimization to avoid locking the page and freeing + * the swapcache if there is little hope that we can reuse. + * + * PageKsm() doesn't necessarily raise the page refcount. + */ + if (PageKsm(page) || page_count(page) > 3) + goto copy; + if (!PageLRU(page)) + /* + * Note: We cannot easily detect+handle references from + * remote LRU pagevecs or references to PageLRU() pages. + */ + lru_add_drain(); + if (page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; - if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { + if (PageSwapCache(page)) + try_to_free_swap(page); + if (PageKsm(page) || page_count(page) != 1) { unlock_page(page); goto copy; } /* - * Ok, we've got the only map reference, and the only - * page count reference, and the page is locked, - * it's dark out, and we're wearing sunglasses. Hit it. + * Ok, we've got the only page reference from our mapping + * and the page is locked, it's dark out, and we're wearing + * sunglasses. Hit it. */ unlock_page(page); wp_page_reuse(vmf); @@ -3340,12 +3352,8 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - zba = first_index; - if (zba < vba) - zba = vba; - zea = last_index; - if (zea > vea) - zea = vea; + zba = max(first_index, vba); + zea = min(last_index, vea); unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, @@ -3377,14 +3385,14 @@ void unmap_mapping_folio(struct folio *folio) first_index = folio->index; last_index = folio->index + folio_nr_pages(folio) - 1; - details.zap_mapping = mapping; + details.even_cows = false; details.single_folio = folio; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } /** @@ -3406,15 +3414,15 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.zap_mapping = even_cows ? NULL : mapping; + details.even_cows = even_cows; if (last_index < first_index) last_index = ULONG_MAX; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); @@ -3481,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } +static inline bool should_try_to_free_swap(struct page *page, + struct vm_area_struct *vma, + unsigned int fault_flags) +{ + if (!PageSwapCache(page)) + return false; + if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || + PageMlocked(page)) + return true; + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * user. Try freeing the swapcache to get rid of the swapcache + * reference only in case it's likely that we'll be the exlusive user. + */ + return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + page_count(page) == 2; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3599,21 +3626,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - /* - * Make sure try_to_free_swap or reuse_swap_page or swapoff did not - * release the swapcache from under us. The page pin, and pte_same - * test below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not changed. - */ - if (unlikely((!PageSwapCache(page) || - page_private(page) != entry.val)) && swapcache) - goto out_page; - - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - goto out_page; + if (swapcache) { + /* + * Make sure try_to_free_swap or swapoff did not release the + * swapcache from under us. The page pin, and pte_same test + * below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not + * changed. + */ + if (unlikely(!PageSwapCache(page) || + page_private(page) != entry.val)) + goto out_page; + + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout. + */ + page = ksm_might_need_to_copy(page, vma, vmf->address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; + } + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * pagevecs if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + !PageKsm(page) && !PageLRU(page)) + lru_add_drain(); } cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3632,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* - * The page isn't present yet, go ahead with the fault. - * - * Be careful about the sequence of operations here. - * To get its accounting right, reuse_swap_page() must be called - * while the page is counted on swap but not yet in mapcount i.e. - * before page_add_anon_rmap() and swap_free(); try_to_free_swap() - * must be called after the swap_free(), or it will never succeed. + * Remove the swap entry and conditionally try to free up the swapcache. + * We're already holding a reference on the page but haven't mapped it + * yet. */ + swap_free(entry); + if (should_try_to_free_swap(page, vma, vmf->flags)) + try_to_free_swap(page); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + + /* + * Same logic as in do_wp_page(); however, optimize for fresh pages + * that are certainly not shared because we just allocated them without + * exposing them to the swapcache. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + (page != swapcache || page_count(page) == 1)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3670,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - swap_free(entry); - if (mem_cgroup_swap_full(page) || - (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* @@ -3871,11 +3918,16 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; if (unlikely(PageHWPoison(vmf->page))) { - if (ret & VM_FAULT_LOCKED) + vm_fault_t poisonret = VM_FAULT_HWPOISON; + if (ret & VM_FAULT_LOCKED) { + /* Retry if a clean page was removed from the cache. */ + if (invalidate_inode_page(vmf->page)) + poisonret = 0; unlock_page(vmf->page); + } put_page(vmf->page); vmf->page = NULL; - return VM_FAULT_HWPOISON; + return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) @@ -3947,7 +3999,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -3996,7 +4049,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } @@ -4622,6 +4675,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), @@ -5256,14 +5310,6 @@ void print_vma_addr(char *prefix, unsigned long ip) #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { - /* - * Some code (nfs/sunrpc) uses socket ops on kernel memory while - * holding the mmap_lock, this is safe because kernel memory doesn't - * get paged out, therefore we'll never actually fault, and the - * below annotations will generate false positives. - */ - if (uaccess_kernel()) - return; if (pagefault_disabled()) return; __might_sleep(file, line); @@ -5444,6 +5490,8 @@ long copy_huge_page_from_user(struct page *dst_page, if (rc) break; + flush_dcache_page(subpage); + cond_resched(); } return ret_val; |