From 450cee4587f9c227f16b123f1e493f0efe64d2fc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 24 Mar 2020 15:55:22 +1100 Subject: mm: code cleanup for MADV_FREE Some comments for MADV_FREE is revised and added to help people understand the MADV_FREE code, especially the page flag, PG_swapbacked. This makes page_is_file_cache() isn't consistent with its comments. So the function is renamed to page_is_file_lru() to make them consistent again. All these are put in one patch as one logical change. Link: http://lkml.kernel.org/r/20200317100342.2730705-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: David Hildenbrand Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Suggested-by: David Rientjes Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Pankaj Gupta Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Mel Gorman Cc: Minchan Kim Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index 311c0dadf71c..0fee14b39416 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -98,7 +98,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) continue; /* -- cgit v1.2.1 From 9d658665883e84484a5717b83bc6b0d79f00d9bf Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 24 Mar 2020 15:55:32 +1100 Subject: mm: merge parameters for change_protection() change_protection() was used by either the NUMA or mprotect() code, there's one parameter for each of the callers (dirty_accountable and prot_numa). Further, these parameters are passed along the calls: - change_protection_range() - change_p4d_range() - change_pud_range() - change_pmd_range() - ... Now we introduce a flag for change_protect() and all these helpers to replace these parameters. Then we can avoid passing multiple parameters multiple times along the way. More importantly, it'll greatly simplify the work if we want to introduce any new parameters to change_protection(). In the follow up patches, a new parameter for userfaultfd write protection will be introduced. No functional change at all. Link: http://lkml.kernel.org/r/20200220163112.11409-7-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jerome Glisse Cc: Andrea Arcangeli Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mprotect.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index 0fee14b39416..046e0889e65f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; + bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; /* * Can be called with only the mmap_sem for reading by @@ -188,7 +190,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; @@ -229,7 +231,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, __split_huge_pmd(vma, pmd, addr, false, NULL); } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); + newprot, cp_flags); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { @@ -244,7 +246,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* fall through, the trans huge pmd just split */ } this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); pages += this_pages; next: cond_resched(); @@ -260,7 +262,7 @@ next: static inline unsigned long change_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; @@ -272,7 +274,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pud++, addr = next, addr != end); return pages; @@ -280,7 +282,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, static inline unsigned long change_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; @@ -292,7 +294,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(vma, p4d, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (p4d++, addr = next, addr != end); return pages; @@ -300,7 +302,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -317,7 +319,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(vma, pgd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -330,14 +332,15 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { unsigned long pages; if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + pages = change_protection_range(vma, start, end, newprot, + cp_flags); return pages; } @@ -459,7 +462,7 @@ success: vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable, 0); + dirty_accountable ? MM_CP_DIRTY_ACCT : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major -- cgit v1.2.1 From a6ac85184abb5c46bab860bf814ca1e27d79ca27 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 24 Mar 2020 15:55:32 +1100 Subject: userfaultfd: wp: apply _PAGE_UFFD_WP bit Firstly, introduce two new flags MM_CP_UFFD_WP[_RESOLVE] for change_protection() when used with uffd-wp and make sure the two new flags are exclusively used. Then, - For MM_CP_UFFD_WP: apply the _PAGE_UFFD_WP bit and remove _PAGE_RW when a range of memory is write protected by uffd - For MM_CP_UFFD_WP_RESOLVE: remove the _PAGE_UFFD_WP bit and recover _PAGE_RW when write protection is resolved from userspace And use this new interface in mwriteprotect_range() to replace the old MM_CP_DIRTY_ACCT. Do this change for both PTEs and huge PMDs. Then we can start to identify which PTE/PMD is write protected by general (e.g., COW or soft dirty tracking), and which is for userfaultfd-wp. Since we should keep the _PAGE_UFFD_WP when doing pte_modify(), add it into _PAGE_CHG_MASK as well. Meanwhile, since we have this new bit, we can be even more strict when detecting uffd-wp page faults in either do_wp_page() or wp_huge_pmd(). After we're with _PAGE_UFFD_WP, a special case is when a page is both protected by the general COW logic and also userfault-wp. Here the userfault-wp will have higher priority and will be handled first. Only after the uffd-wp bit is cleared on the PTE/PMD will we continue to handle the general COW. These are the steps on what will happen with such a page: 1. CPU accesses write protected shared page (so both protected by general COW and uffd-wp), blocked by uffd-wp first because in do_wp_page we'll handle uffd-wp first, so it has higher priority than general COW. 2. Uffd service thread receives the request, do UFFDIO_WRITEPROTECT to remove the uffd-wp bit upon the PTE/PMD. However here we still keep the write bit cleared. Notify the blocked CPU. 3. The blocked CPU resumes the page fault process with a fault retry, during retry it'll notice it was not with the uffd-wp bit this time but it is still write protected by general COW, then it'll go though the COW path in the fault handler, copy the page, apply write bit where necessary, and retry again. 4. The CPU will be able to access this page with write bit set. Link: http://lkml.kernel.org/r/20200220163112.11409-8-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Cc: Brian Geffon Cc: Pavel Emelyanov Cc: Mike Kravetz Cc: David Hildenbrand Cc: Martin Cracauer Cc: Mel Gorman Cc: Bobby Powers Cc: Mike Rapoport Cc: "Kirill A . Shutemov" Cc: Maya Gokhale Cc: Johannes Weiner Cc: Marty McFadden Cc: Denis Plotnikov Cc: Hugh Dickins Cc: "Dr . David Alan Gilbert" Cc: Jerome Glisse Cc: Rik van Riel Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mprotect.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index 046e0889e65f..e4fa41a24bec 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -45,6 +45,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, int target_node = NUMA_NO_NODE; bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * Can be called with only the mmap_sem for reading by @@ -116,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (preserve_write) ptent = pte_mk_savedwrite(ptent); + if (uffd_wp) { + ptent = pte_wrprotect(ptent); + ptent = pte_mkuffd_wp(ptent); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled + * by PF interrupt handler, then + * things like COW could be properly + * handled. + */ + ptent = pte_clear_uffd_wp(ptent); + } + /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && (pte_soft_dirty(ptent) || @@ -336,6 +351,8 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, { unsigned long pages; + BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else -- cgit v1.2.1 From b3e18fcabfa1a02bf5798cf18daff7b8aa800eca Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 24 Mar 2020 15:55:34 +1100 Subject: userfaultfd: wp: support swap and page migration For either swap and page migration, we all use the bit 2 of the entry to identify whether this entry is uffd write-protected. It plays a similar role as the existing soft dirty bit in swap entries but only for keeping the uffd-wp tracking for a specific PTE/PMD. Something special here is that when we want to recover the uffd-wp bit from a swap/migration entry to the PTE bit we'll also need to take care of the _PAGE_RW bit and make sure it's cleared, otherwise even with the _PAGE_UFFD_WP bit we can't trap it at all. In change_pte_range() we do nothing for uffd if the PTE is a swap entry. That can lead to data mismatch if the page that we are going to write protect is swapped out when sending the UFFDIO_WRITEPROTECT. This patch also applies/removes the uffd-wp bit even for the swap entries. Link: http://lkml.kernel.org/r/20200220163112.11409-11-peterx@redhat.com Signed-off-by: Peter Xu Cc: Andrea Arcangeli Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mprotect.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index e4fa41a24bec..1d823b050329 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -139,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION)) { + } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; if (is_write_migration_entry(entry)) { - pte_t newpte; /* * A protection check is difficult so * just be safe and disable write @@ -152,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - set_pte_at(vma->vm_mm, addr, pte, newpte); - - pages++; - } - - if (is_write_device_private_entry(entry)) { - pte_t newpte; - + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_write_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_one_pte() for explanation. */ make_device_private_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_pte_at(vma->vm_mm, addr, pte, newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else { + newpte = oldpte; + } + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } } -- cgit v1.2.1