author     Hugh Dickins <hughd@google.com>    2022-11-02 18:51:38 -0700
committer  Andrew Morton <akpm@linux-foundation.org>    2022-11-30 15:58:46 -0800
commit     cb67f4282bf9693658dbda934a441ddbbb1446df (patch)
tree       799f5d9f3b36ff8d844e90ace1f09a94a7ec3a5c /mm/rmap.c
parent     dad6a5eb55564845aa17b8b20fa834af21e46c48 (diff)
download   linux-stable-cb67f4282bf9693658dbda934a441ddbbb1446df.tar.gz
mm,thp,rmap: simplify compound page mapcount handling
Compound page (folio) mapcount calculations have been different for anon and file (or shmem) THPs, and involved the obscure PageDoubleMap flag. And each huge mapping and unmapping of a file (or shmem) THP involved atomically incrementing and decrementing the mapcount of every subpage of that huge page, dirtying many struct page cachelines.

Add subpages_mapcount field to the struct folio and first tail page, so that the total of subpage mapcounts is available in one place near the head: then page_mapcount() and total_mapcount() and page_mapped(), and their folio equivalents, are so quick that anon and file and hugetlb don't need to be optimized differently. Delete the unloved PageDoubleMap.

page_add and page_remove rmap functions must now maintain the subpages_mapcount as well as the subpage _mapcount, when dealing with pte mappings of huge pages; and correct maintenance of NR_ANON_MAPPED and NR_FILE_MAPPED statistics still needs reading through the subpages, using nr_subpages_unmapped() - but only when first or last pmd mapping finds subpages_mapcount raised (double-map case, not the common case).

But are those counts (used to decide when to split an anon THP, and in vmscan's pagecache_reclaimable heuristic) correctly maintained? Not quite: since page_remove_rmap() (and also split_huge_pmd()) is often called without page lock, there can be races when a subpage pte mapcount 0<->1 while compound pmd mapcount 0<->1 is scanning - races which the previous implementation had prevented. The statistics might become inaccurate, and even drift down until they underflow through 0. That is not good enough, but is better dealt with in a followup patch.

Update a few comments on first and second tail page overlaid fields. hugepage_add_new_anon_rmap() has to "increment" compound_mapcount, but subpages_mapcount and compound_pincount are already correctly at 0, so delete its reinitialization of compound_pincount.

A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 119 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 92ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. But there might be some benchmarks which would show a slowdown, because tail struct pages now fall out of cache until final freeing checks them.

Link: https://lkml.kernel.org/r/47ad693-717-79c8-e1ba-46c3a6602e48@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
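To make the new accounting concrete, here is a minimal, illustrative-only sketch in plain C11, not the kernel code itself: the standalone struct below is a hypothetical stand-in for the head-page counters, and its field names merely echo compound_mapcount and subpages_mapcount from the patch. It shows why a total_mapcount()-style query becomes cheap once both counts live near the head, instead of requiring a walk over every subpage's _mapcount.

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative sketch only: not the kernel's struct page/folio layout. */
struct thp_head_counters {
	atomic_int compound_mapcount;	/* pmd (huge) mappings; starts at -1 */
	atomic_int subpages_mapcount;	/* pte mappings of all subpages; starts at 0 */
};

/*
 * Total number of times any part of the huge page is mapped: each pmd
 * mapping contributes once, and every pte mapping of any subpage has
 * already been folded into subpages_mapcount by the rmap functions,
 * so no loop over the subpages is needed.
 */
static int thp_total_mapcount(struct thp_head_counters *head)
{
	return (atomic_load(&head->compound_mapcount) + 1) +
	       atomic_load(&head->subpages_mapcount);
}

/* A page_mapped()-style check reduces to a couple of reads near the head. */
static bool thp_is_mapped(struct thp_head_counters *head)
{
	return thp_total_mapcount(head) > 0;
}

Whether this exact sum matches the patch's total_mapcount() helper is best checked against include/linux/mm.h in the same series; the point here is only that the query is O(1) rather than a scan of subpage _mapcounts.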
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--    mm/rmap.c    142
1 file changed, 81 insertions(+), 61 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 3b2d18bbdc44..f43339ea4970 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1085,6 +1085,24 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
+/*
+ * When mapping a THP's first pmd, or unmapping its last pmd, if that THP
+ * also has pte mappings, then those must be discounted: in order to maintain
+ * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift,
+ * and to decide when an anon THP should be put on the deferred split queue.
+ */
+static int nr_subpages_unmapped(struct page *head, int nr_subpages)
+{
+ int nr = nr_subpages;
+ int i;
+
+ /* Discount those subpages mapped by pte */
+ for (i = 0; i < nr_subpages; i++)
+ if (atomic_read(&head[i]._mapcount) >= 0)
+ nr--;
+ return nr;
+}
+
/**
* page_move_anon_rmap - move a page to our anon_vma
* @page: the page to move to our anon_vma
@@ -1194,6 +1212,7 @@ static void __page_check_anon_rmap(struct page *page,
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
+ int nr, nr_pages;
bool compound = flags & RMAP_COMPOUND;
bool first;
@@ -1202,28 +1221,32 @@ void page_add_anon_rmap(struct page *page,
else
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (compound) {
+ if (compound && PageTransHuge(page)) {
atomic_t *mapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
mapcount = compound_mapcount_ptr(page);
first = atomic_inc_and_test(mapcount);
+
+ nr = nr_pages = thp_nr_pages(page);
+ if (first && head_subpages_mapcount(page))
+ nr = nr_subpages_unmapped(page, nr_pages);
} else {
+ nr = 1;
+ if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ atomic_inc(subpages_mapcount_ptr(head));
+ nr = !head_compound_mapcount(head);
+ }
first = atomic_inc_and_test(&page->_mapcount);
}
+
VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first) {
- int nr = compound ? thp_nr_pages(page) : 1;
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption
- * disabled.
- */
if (compound)
- __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pages);
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
@@ -1265,8 +1288,6 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
-
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
/* increment count (starts at -1) */
@@ -1287,29 +1308,19 @@ void page_add_new_anon_rmap(struct page *page,
void page_add_file_rmap(struct page *page,
struct vm_area_struct *vma, bool compound)
{
- int i, nr = 0;
+ int nr = 0;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- int nr_pages = thp_nr_pages(page);
+ int nr_pages;
- for (i = 0; i < nr_pages; i++) {
- if (atomic_inc_and_test(&page[i]._mapcount))
- nr++;
- }
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
- /*
- * It is racy to ClearPageDoubleMap in page_remove_file_rmap();
- * but page lock is held by all page_add_file_rmap() compound
- * callers, and SetPageDoubleMap below warns if !PageLocked:
- * so here is a place that DoubleMap can be safely cleared.
- */
- VM_WARN_ON_ONCE(!PageLocked(page));
- if (nr == nr_pages && PageDoubleMap(page))
- ClearPageDoubleMap(page);
+ nr = nr_pages = thp_nr_pages(page);
+ if (head_subpages_mapcount(page))
+ nr = nr_subpages_unmapped(page, nr_pages);
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
@@ -1318,11 +1329,15 @@ void page_add_file_rmap(struct page *page,
__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
nr_pages);
} else {
- if (PageTransCompound(page) && page_mapping(page)) {
- VM_WARN_ON_ONCE(!PageLocked(page));
- SetPageDoubleMap(compound_head(page));
+ bool pmd_mapped = false;
+
+ if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ atomic_inc(subpages_mapcount_ptr(head));
+ pmd_mapped = head_compound_mapcount(head);
}
- if (atomic_inc_and_test(&page->_mapcount))
+ if (atomic_inc_and_test(&page->_mapcount) && !pmd_mapped)
nr++;
}
out:
@@ -1335,7 +1350,7 @@ out:
static void page_remove_file_rmap(struct page *page, bool compound)
{
- int i, nr = 0;
+ int nr = 0;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
@@ -1348,14 +1363,15 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- int nr_pages = thp_nr_pages(page);
+ int nr_pages;
- for (i = 0; i < nr_pages; i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
- }
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
+ return;
+
+ nr = nr_pages = thp_nr_pages(page);
+ if (head_subpages_mapcount(page))
+ nr = nr_subpages_unmapped(page, nr_pages);
+
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
-nr_pages);
@@ -1363,17 +1379,25 @@ static void page_remove_file_rmap(struct page *page, bool compound)
__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
-nr_pages);
} else {
- if (atomic_add_negative(-1, &page->_mapcount))
+ bool pmd_mapped = false;
+
+ if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ atomic_dec(subpages_mapcount_ptr(head));
+ pmd_mapped = head_compound_mapcount(head);
+ }
+ if (atomic_add_negative(-1, &page->_mapcount) && !pmd_mapped)
nr++;
}
-out:
+
if (nr)
__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
}
static void page_remove_anon_compound_rmap(struct page *page)
{
- int i, nr;
+ int nr, nr_pages;
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
return;
@@ -1385,27 +1409,19 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
+ nr = nr_pages = thp_nr_pages(page);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, -nr);
- if (TestClearPageDoubleMap(page)) {
- /*
- * Subpages can be mapped with PTEs too. Check how many of
- * them are still mapped.
- */
- for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
- }
+ if (head_subpages_mapcount(page)) {
+ nr = nr_subpages_unmapped(page, nr_pages);
/*
* Queue the page for deferred split if at least one small
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
- if (nr && nr < thp_nr_pages(page))
+ if (nr && nr < nr_pages)
deferred_split_huge_page(page);
- } else {
- nr = thp_nr_pages(page);
}
if (nr)
@@ -1423,6 +1439,8 @@ static void page_remove_anon_compound_rmap(struct page *page)
void page_remove_rmap(struct page *page,
struct vm_area_struct *vma, bool compound)
{
+ bool pmd_mapped = false;
+
lock_page_memcg(page);
if (!PageAnon(page)) {
@@ -1435,15 +1453,17 @@ void page_remove_rmap(struct page *page,
goto out;
}
+ if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ atomic_dec(subpages_mapcount_ptr(head));
+ pmd_mapped = head_compound_mapcount(head);
+ }
+
/* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
+ if (!atomic_add_negative(-1, &page->_mapcount) || pmd_mapped)
goto out;
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
- */
__dec_lruvec_page_state(page, NR_ANON_MAPPED);
if (PageTransCompound(page))
@@ -2569,8 +2589,8 @@ void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
ClearHPageRestoreReserve(page);
__page_set_anon_rmap(page, vma, address, 1);
}
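For orientation after reading the hunks above: the pte-side bookkeeping they add (bump subpages_mapcount on the head, and credit the mapped-pages statistic only when the head is not already pmd-mapped) can be condensed into a small self-contained sketch. The struct and the plain counter below are hypothetical simplifications, not the kernel's struct page fields or NR_FILE_MAPPED/NR_ANON_MAPPED lruvec statistics, and all locking is omitted.

#include <stdatomic.h>
#include <stdbool.h>

#define THP_NR_SUBPAGES 512			/* e.g. a 2MB THP of 4KB subpages */

struct subpage {
	atomic_int mapcount;			/* like _mapcount; starts at -1 */
};

struct thp {
	atomic_int compound_mapcount;		/* pmd mappings; starts at -1 */
	atomic_int subpages_mapcount;		/* pte mappings of all subpages; starts at 0 */
	struct subpage sub[THP_NR_SUBPAGES];
	long nr_mapped_stat;			/* stand-in for NR_*_MAPPED */
};

/* Sketch of the non-compound branch of page_add_file_rmap()/page_add_anon_rmap(). */
static void pte_map_subpage(struct thp *thp, int i)
{
	bool pmd_mapped;

	/* Keep the per-THP total of pte mappings near the head. */
	atomic_fetch_add(&thp->subpages_mapcount, 1);
	pmd_mapped = atomic_load(&thp->compound_mapcount) >= 0;

	/*
	 * First pte mapping of this subpage (its mapcount was -1), and no
	 * pmd mapping already accounts for it: only then does it add to
	 * the mapped-pages statistic.  In the kernel the update is done
	 * with __mod_lruvec_page_state() under the pte lock.
	 */
	if (atomic_fetch_add(&thp->sub[i].mapcount, 1) == -1 && !pmd_mapped)
		thp->nr_mapped_stat += 1;
}

The removal path in the patch is symmetrical: atomic_dec() of subpages_mapcount, then the statistic is decremented only when the subpage's last pte mapping goes away and no pmd mapping remains.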