summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c282
1 files changed, 165 insertions, 117 deletions
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..be44d0b36b18 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
set_pte_at(vma->vm_mm, address, ptep, pte);
- if (vma->vm_flags & VM_LOCKED)
- mlock_vma_page(page);
-
/*
* No need to invalidate - it was non-present before. However
* secondary CPUs may have mappings that need invalidating.
@@ -1309,22 +1306,34 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
- struct address_space *zap_mapping; /* Check page->mapping if set */
struct folio *single_folio; /* Locked folio to be unmapped */
+ bool even_cows; /* Zap COWed private pages too? */
};
-/*
- * We set details->zap_mapping when we want to unmap shared but keep private
- * pages. Return true if skip zapping this page, false otherwise.
- */
-static inline bool
-zap_skip_check_mapping(struct zap_details *details, struct page *page)
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
{
- if (!details || !page)
- return false;
+ /* By default, zap all pages */
+ if (!details)
+ return true;
- return details->zap_mapping &&
- (details->zap_mapping != page_rmapping(page));
+ /* Or, we zap COWed pages only if the caller wants to */
+ return details->even_cows;
+}
+
+/* Decides whether we should zap this page with the page pointer specified */
+static inline bool should_zap_page(struct zap_details *details, struct page *page)
+{
+ /* If we can make a decision without *page.. */
+ if (should_zap_cows(details))
+ return true;
+
+ /* E.g. the caller passes NULL for the case of a zero page */
+ if (!page)
+ return true;
+
+ /* Otherwise we should only zap non-anon pages */
+ return !PageAnon(page);
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1349,6 +1358,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
+ struct page *page;
+
if (pte_none(ptent))
continue;
@@ -1356,10 +1367,8 @@ again:
break;
if (pte_present(ptent)) {
- struct page *page;
-
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(zap_skip_check_mapping(details, page)))
+ if (unlikely(!should_zap_page(details, page)))
continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
@@ -1377,7 +1386,7 @@ again:
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
@@ -1391,34 +1400,32 @@ again:
entry = pte_to_swp_entry(ptent);
if (is_device_private_entry(entry) ||
is_device_exclusive_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
-
- if (unlikely(zap_skip_check_mapping(details, page)))
+ page = pfn_swap_entry_to_page(entry);
+ if (unlikely(!should_zap_page(details, page)))
continue;
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
-
if (is_device_private_entry(entry))
- page_remove_rmap(page, false);
-
+ page_remove_rmap(page, vma, false);
put_page(page);
- continue;
- }
-
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
- continue;
-
- if (!non_swap_entry(entry))
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entry, hence a private anon page */
+ if (!should_zap_cows(details))
+ continue;
rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
- struct page *page;
-
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ } else if (is_migration_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
+ if (!should_zap_page(details, page))
+ continue;
rss[mm_counter(page)]--;
+ } else if (is_hwpoison_entry(entry)) {
+ if (!should_zap_cows(details))
+ continue;
+ } else {
+ /* We should have covered all the swap entry types */
+ WARN_ON_ONCE(1);
}
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1705,7 +1712,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
- if (address < vma->vm_start || address + size > vma->vm_end ||
+ if (!range_in_vma(vma, address, address + size) ||
!(vma->vm_flags & VM_PFNMAP))
return;
@@ -1753,16 +1760,16 @@ static int validate_page_before_insert(struct page *page)
return 0;
}
-static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
if (!pte_none(*pte))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ page_add_file_rmap(page, vma, false);
+ set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
}
@@ -1776,7 +1783,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
- struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
@@ -1785,17 +1791,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
if (retval)
goto out;
retval = -ENOMEM;
- pte = get_locked_pte(mm, addr, &ptl);
+ pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1805,7 +1811,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
err = validate_page_before_insert(page);
if (err)
return err;
- return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1842,7 +1848,7 @@ more:
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pte,
+ int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
@@ -3098,7 +3104,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
}
/* Free the old page.. */
@@ -3118,16 +3124,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
- lock_page(old_page); /* LRU manipulation */
- if (PageMlocked(old_page))
- munlock_vma_page(old_page);
- unlock_page(old_page);
- }
if (page_copied)
free_swap_cache(old_page);
put_page(old_page);
@@ -3291,19 +3287,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (PageAnon(vmf->page)) {
struct page *page = vmf->page;
- /* PageKsm() doesn't necessarily raise the page refcount */
- if (PageKsm(page) || page_count(page) != 1)
+ /*
+ * We have to verify under page lock: these early checks are
+ * just an optimization to avoid locking the page and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * PageKsm() doesn't necessarily raise the page refcount.
+ */
+ if (PageKsm(page) || page_count(page) > 3)
+ goto copy;
+ if (!PageLRU(page))
+ /*
+ * Note: We cannot easily detect+handle references from
+ * remote LRU pagevecs or references to PageLRU() pages.
+ */
+ lru_add_drain();
+ if (page_count(page) > 1 + PageSwapCache(page))
goto copy;
if (!trylock_page(page))
goto copy;
- if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (PageKsm(page) || page_count(page) != 1) {
unlock_page(page);
goto copy;
}
/*
- * Ok, we've got the only map reference, and the only
- * page count reference, and the page is locked,
- * it's dark out, and we're wearing sunglasses. Hit it.
+ * Ok, we've got the only page reference from our mapping
+ * and the page is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
*/
unlock_page(page);
wp_page_reuse(vmf);
@@ -3340,12 +3352,8 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
vma_interval_tree_foreach(vma, root, first_index, last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- zba = first_index;
- if (zba < vba)
- zba = vba;
- zea = last_index;
- if (zea > vea)
- zea = vea;
+ zba = max(first_index, vba);
+ zea = min(last_index, vea);
unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
@@ -3377,14 +3385,14 @@ void unmap_mapping_folio(struct folio *folio)
first_index = folio->index;
last_index = folio->index + folio_nr_pages(folio) - 1;
- details.zap_mapping = mapping;
+ details.even_cows = false;
details.single_folio = folio;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
}
/**
@@ -3406,15 +3414,15 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t first_index = start;
pgoff_t last_index = start + nr - 1;
- details.zap_mapping = even_cows ? NULL : mapping;
+ details.even_cows = even_cows;
if (last_index < first_index)
last_index = ULONG_MAX;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@@ -3481,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
+static inline bool should_try_to_free_swap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ if (!PageSwapCache(page))
+ return false;
+ if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
+ PageMlocked(page))
+ return true;
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * user. Try freeing the swapcache to get rid of the swapcache
+ * reference only in case it's likely that we'll be the exlusive user.
+ */
+ return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ page_count(page) == 2;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3599,21 +3626,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- /*
- * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
- * release the swapcache from under us. The page pin, and pte_same
- * test below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not changed.
- */
- if (unlikely((!PageSwapCache(page) ||
- page_private(page) != entry.val)) && swapcache)
- goto out_page;
-
- page = ksm_might_need_to_copy(page, vma, vmf->address);
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- goto out_page;
+ if (swapcache) {
+ /*
+ * Make sure try_to_free_swap or swapoff did not release the
+ * swapcache from under us. The page pin, and pte_same test
+ * below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not
+ * changed.
+ */
+ if (unlikely(!PageSwapCache(page) ||
+ page_private(page) != entry.val))
+ goto out_page;
+
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * page->index of !PageKSM() pages would be nonlinear inside the
+ * anon VMA -- PageKSM() is lost on actual swapout.
+ */
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
+ }
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * pagevecs if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
+ !PageKsm(page) && !PageLRU(page))
+ lru_add_drain();
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3632,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
- * The page isn't present yet, go ahead with the fault.
- *
- * Be careful about the sequence of operations here.
- * To get its accounting right, reuse_swap_page() must be called
- * while the page is counted on swap but not yet in mapcount i.e.
- * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
- * must be called after the swap_free(), or it will never succeed.
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * We're already holding a reference on the page but haven't mapped it
+ * yet.
*/
+ swap_free(entry);
+ if (should_try_to_free_swap(page, vma, vmf->flags))
+ try_to_free_swap(page);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+
+ /*
+ * Same logic as in do_wp_page(); however, optimize for fresh pages
+ * that are certainly not shared because we just allocated them without
+ * exposing them to the swapcache.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ (page != swapcache || page_count(page) == 1)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -3670,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- swap_free(entry);
- if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*
@@ -3871,11 +3918,16 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
- if (ret & VM_FAULT_LOCKED)
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ if (ret & VM_FAULT_LOCKED) {
+ /* Retry if a clean page was removed from the cache. */
+ if (invalidate_inode_page(vmf->page))
+ poisonret = 0;
unlock_page(vmf->page);
+ }
put_page(vmf->page);
vmf->page = NULL;
- return VM_FAULT_HWPOISON;
+ return poisonret;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
@@ -3947,7 +3999,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
- page_add_file_rmap(page, true);
+ page_add_file_rmap(page, vma, true);
+
/*
* deposit and withdraw with pmd lock held
*/
@@ -3996,7 +4049,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
+ page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
@@ -4622,6 +4675,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
+ .real_address = address,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
@@ -5256,14 +5310,6 @@ void print_vma_addr(char *prefix, unsigned long ip)
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
- /*
- * Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_lock, this is safe because kernel memory doesn't
- * get paged out, therefore we'll never actually fault, and the
- * below annotations will generate false positives.
- */
- if (uaccess_kernel())
- return;
if (pagefault_disabled())
return;
__might_sleep(file, line);
@@ -5444,6 +5490,8 @@ long copy_huge_page_from_user(struct page *dst_page,
if (rc)
break;
+ flush_dcache_page(subpage);
+
cond_resched();
}
return ret_val;