139 files changed, 4388 insertions, 2486 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9e5bab29685f..a341ef853fc0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -692,7 +692,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [ARM64,KNL,CMA] + [KNL,CMA] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not diff --git a/MAINTAINERS b/MAINTAINERS index 6d0f112eb649..67be30a92cc1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4478,6 +4478,13 @@ S: Supported F: Documentation/filesystems/caching/cachefiles.rst F: fs/cachefiles/ +CACHESTAT: PAGE CACHE STATS FOR A FILE +M: Nhat Pham <nphamcs@gmail.com> +M: Johannes Weiner <hannes@cmpxchg.org> +L: linux-mm@kvack.org +S: Maintained +F: tools/testing/selftests/cachestat/test_cachestat.c + CADENCE MIPI-CSI2 BRIDGES M: Maxime Ripard <mripard@kernel.org> L: linux-media@vger.kernel.org diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 8ebacf37a8cf..1f13995d00d7 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -490,3 +490,4 @@ 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv 560 common set_mempolicy_home_node sys_ni_syscall +561 common cachestat sys_cachestat diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index ac964612d8b0..8ebed8a13874 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -464,3 +464,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 037feba03a51..64a514f90131 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 451 +#define __NR_compat_syscalls 452 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 604a2053d006..d952a28463e0 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_cachestat 451 +__SYSCALL(__NR_cachestat, sys_cachestat) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 7e89968bd282..4c5ef9b20065 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -416,10 +416,9 @@ long get_mte_ctrl(struct task_struct *task) static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, struct iovec *kiov, unsigned int gup_flags) { - struct vm_area_struct *vma; void __user *buf = kiov->iov_base; size_t len = kiov->iov_len; - int ret; + int err = 0; int write = gup_flags & FOLL_WRITE; if (!access_ok(buf, len)) @@ -429,14 +428,16 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, return -EIO; while (len) { + struct vm_area_struct *vma; unsigned long tags, offset; void 
*maddr; - struct page *page = NULL; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); - ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, - &vma, NULL); - if (ret <= 0) + if (IS_ERR_OR_NULL(page)) { + err = page == NULL ? -EIO : PTR_ERR(page); break; + } /* * Only copy tags if the page has been mapped as PROT_MTE @@ -446,7 +447,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, * was never mapped with PROT_MTE. */ if (!(vma->vm_flags & VM_MTE)) { - ret = -EOPNOTSUPP; + err = -EOPNOTSUPP; put_page(page); break; } @@ -479,7 +480,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, kiov->iov_len = buf - kiov->iov_base; if (!kiov->iov_len) { /* check for error accessing the tracee's address space */ - if (ret <= 0) + if (err) return -EIO; else return -EFAULT; diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 4bb1b8f47298..7b889445e5c6 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1044,7 +1044,7 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr) bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; size_t size = KASAN_ESR_SIZE(esr); - u64 addr = regs->regs[0]; + void *addr = (void *)regs->regs[0]; u64 pc = regs->pc; kasan_report(addr, size, write, pc); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index cb21ccd7940d..d5047eef4295 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -317,7 +317,7 @@ static void report_tag_fault(unsigned long addr, unsigned long esr, * find out access size. */ bool is_write = !!(esr & ESR_ELx_WNR); - kasan_report(addr, 0, is_write, regs->pc); + kasan_report((void *)addr, 0, is_write, regs->pc); } #else /* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. 
*/ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 66e70ca47680..d560aef6aafa 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -410,8 +410,6 @@ void __init bootmem_init(void) arm64_hugetlb_cma_reserve(); #endif - dma_pernuma_cma_reserve(); - kvm_hyp_reserve(); /* diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 72c929d9902b..f8c74ffeeefb 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -371,3 +371,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index b1f3940bc298..4f504783371f 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -450,3 +450,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 820145e47350..858d22bf275c 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -456,3 +456,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/mips/include/asm/fw/cfe/cfe_api.h b/arch/mips/include/asm/fw/cfe/cfe_api.h index 25df2f4deb31..b52a6a9c26f1 100644 --- a/arch/mips/include/asm/fw/cfe/cfe_api.h +++ b/arch/mips/include/asm/fw/cfe/cfe_api.h @@ -17,9 +17,6 @@ #include <linux/types.h> #include <linux/string.h> -typedef long intptr_t; - - /* * Constants */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 253ff994ed2e..1976317d4e8b 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -389,3 +389,4 @@ 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node +451 n32 cachestat sys_cachestat diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 3f1886ad9d80..cfda2511badf 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -365,3 +365,4 @@ 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 n64 cachestat sys_cachestat diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 8f243e35a7b2..7692234c3768 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -438,3 +438,4 @@ 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node +451 o32 cachestat sys_cachestat diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0e42fceb2d5e..3c71fad78318 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -448,3 +448,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv 
sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index a0be127475b1..8c0b08b7a80e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -537,3 +537,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 81d7185e2ae8..d19fb1f3007d 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, FOLL_WRITE | FOLL_LONGTERM, - mem->hpages + entry, NULL); + mem->hpages + entry); if (ret == n) { pinned += n; continue; diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index b68f47541169..a6935af2235c 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat sys_cachestat diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index da6dac36e959..9bd0a873f3b1 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2777,7 +2777,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr) mmap_read_lock(kvm->mm); get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL, NULL); + &page, NULL); mmap_read_unlock(kvm->mm); return page; } diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 2de85c977f54..97377e8c5025 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 4398cc6fb68d..faa835f3c54a 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -496,3 +496,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..bc0a3c941b35 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..227538b0ce80 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 
common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 21ca0a831b70..5d390df21440 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -214,7 +214,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, if (!(vma->vm_flags & VM_MAYEXEC)) return -EACCES; - ret = get_user_pages(src, 1, 0, &src_page, NULL); + ret = get_user_pages(src, 1, 0, &src_page); if (ret < 1) return -EFAULT; diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 52c94ab5c205..2b69c3c035b6 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -421,3 +421,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c index ee7059399e9c..b7e488c1b5df 100644 --- a/drivers/dma-buf/heaps/system_heap.c +++ b/drivers/dma-buf/heaps/system_heap.c @@ -41,7 +41,7 @@ struct dma_heap_attachment { bool mapped; }; -#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO) +#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO | __GFP_RETRY_MAYFAIL) #define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \ | __GFP_NORETRY) & ~__GFP_RECLAIM) \ | __GFP_COMP) @@ -350,6 +350,9 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap, struct page *page, *tmp_page; int i, ret = -ENOMEM; + if (len / PAGE_SIZE > totalram_pages()) + return ERR_PTR(-ENOMEM); + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 2220cdf6a3f6..3a9db030f98f 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -359,7 +359,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_device *bdev, struct ttm_tt *ttm struct page **pages = ttm->pages + pinned; r = get_user_pages(userptr, num_pages, write ? 
FOLL_WRITE : 0, - pages, NULL); + pages); if (r < 0) goto release_pages; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index f693bc753b6b..1bb7507325bc 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -111,7 +111,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, ret = pin_user_pages(start_page + got * PAGE_SIZE, num_pages - got, FOLL_LONGTERM | FOLL_WRITE, - p + got, NULL); + p + got); if (ret < 0) { mmap_read_unlock(current->mm); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 2a5cac2658ec..84e0f41e7dfa 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -140,7 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = pin_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + gup_flags, page_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index f51ab2ccf151..e6e25f15567d 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -422,7 +422,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) umem->page_chunk[i].plist = plist; while (nents) { rv = pin_user_pages(first_page_va, nents, foll_flags, - plist, NULL); + plist); if (rv < 0) goto out_sem_up; diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 3c47846cc5ef..412ca96be128 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -786,7 +786,7 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, user->locked = 1; } rc = pin_user_pages_remote(pages->source_mm, uptr, npages, - user->gup_flags, user->upages, NULL, + user->gup_flags, user->upages, &user->locked); } if (rc <= 0) { @@ -1799,7 +1799,7 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, rc = pin_user_pages_remote( pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page, - NULL, NULL); + NULL); mmap_read_unlock(pages->source_mm); if (rc != 1) { if (WARN_ON(rc >= 0)) diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 53001532e8e3..405b89ea1054 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -180,7 +180,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, data, size, dma->nr_pages); err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, gup_flags, - dma->pages, NULL); + dma->pages); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index b836936e9747..378cf02a2aa1 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -185,7 +185,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write ? 
FOLL_WRITE : 0, &page) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index de97e38c3b82..4d4405f058e8 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1052,7 +1052,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, - page_list, NULL); + page_list); if (pinned != npages) { ret = pinned < 0 ? pinned : -ENOMEM; goto out; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3d4dd9420c30..3d2d9a944906 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -562,7 +562,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, mmap_read_lock(mm); ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL, NULL); + pages, NULL); if (ret > 0) { int i; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 8c1aefc865f0..61223fcbe82b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -983,7 +983,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, while (npages) { sz2pin = min_t(unsigned long, npages, list_size); pinned = pin_user_pages(cur_base, sz2pin, - gup_flags, page_list, NULL); + gup_flags, page_list); if (sz2pin != pinned) { if (pinned < 0) { ret = pinned; diff --git a/fs/exec.c b/fs/exec.c index a466e797c8e2..25c65b64544b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,7 +220,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, - &page, NULL, NULL); + &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae4e51e91ee3..aca4b4811394 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2024,7 +2024,6 @@ static long wb_writeback(struct bdi_writeback *wb, struct blk_plug plug; blk_start_plug(&plug); - spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -2049,6 +2048,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !wb_over_bg_thresh(wb)) break; + + spin_lock(&wb->list_lock); + /* * Kupdate and background works are special and we want to * include all inodes that need writing. Livelock avoidance is @@ -2078,13 +2080,19 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (progress) + if (progress) { + spin_unlock(&wb->list_lock); continue; + } + /* * No more inodes for IO, bail */ - if (list_empty(&wb->b_more_io)) + if (list_empty(&wb->b_more_io)) { + spin_unlock(&wb->list_lock); break; + } + /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise @@ -2096,9 +2104,7 @@ static long wb_writeback(struct bdi_writeback *wb, spin_unlock(&wb->list_lock); /* This function drops i_lock... 
*/ inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); } - spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work->nr_pages; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ecfdfb2529a3..90361a922cec 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -834,9 +834,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, break; } - /* Set numa allocation policy based on index */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); - /* addr is the offset within the file (zero based) */ addr = index * hpage_size; @@ -850,7 +847,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, rcu_read_unlock(); if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - hugetlb_drop_vma_policy(&pseudo_vma); continue; } @@ -862,6 +858,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 48030899dc6e..34102fe63c0e 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1955,36 +1955,40 @@ undo_alloc: "attribute.%s", es); NVolSetErrors(vol); } - a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { ntfs_error(vol->sb, "Failed to truncate mft data attribute " "runlist.%s", es); NVolSetErrors(vol); } - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + if (ctx) { + a = ctx->attr; + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( a->data.non_resident.mapping_pairs_offset), old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), + a->data.non_resident.mapping_pairs_offset), rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " + ntfs_error(vol->sb, "Failed to restore mapping pairs " "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " "record.%s", es); - NVolSetErrors(vol); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " + else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " "context.%s", es); - NVolSetErrors(vol); + NVolSetErrors(vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); } - if (ctx) - ntfs_attr_put_search_ctx(ctx); if (!IS_ERR(mrec)) unmap_mft_record(mft_ni); up_write(&mft_ni->runlist.lock); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 25b44b303b35..5d0cf59c4926 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -419,7 +419,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) char *notes; size_t i = 0; - strlcpy(prpsinfo.pr_psargs, saved_command_line, + strscpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); notes = kzalloc(notes_len, GFP_KERNEL); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 
420510f6a545..6259dd432eeb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1689,23 +1689,23 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + unsigned long end; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); + + end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); + if (end >= start_vaddr && end < mm->task_size) + end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index bed3bb8b27fa..2b7f4c6fa77c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -76,10 +76,100 @@ static int copy_bio_to_actor(struct bio *bio, return copied_bytes; } +static int squashfs_bio_read_cached(struct bio *fullbio, + struct address_space *cache_mapping, u64 index, int length, + u64 read_start, u64 read_end, int page_count) +{ + struct page *head_to_cache = NULL, *tail_to_cache = NULL; + struct block_device *bdev = fullbio->bi_bdev; + int start_idx = 0, end_idx = 0; + struct bvec_iter_all iter_all; + struct bio *bio = NULL; + struct bio_vec *bv; + int idx = 0; + int err = 0; + + bio_for_each_segment_all(bv, fullbio, iter_all) { + struct page *page = bv->bv_page; + + if (page->mapping == cache_mapping && PageUptodate(page)) { + idx++; + continue; + } + + /* + * We only use this when the device block size is the same as + * the page size, so read_start and read_end cover full pages. + * + * Compare these to the original required index and length to + * only cache pages which were requested partially, since these + * are the ones which are likely to be needed when reading + * adjacent blocks. + */ + if (idx == 0 && index != read_start) + head_to_cache = page; + else if (idx == page_count - 1 && index + length != read_end) + tail_to_cache = page; + + if (!bio || idx != end_idx) { + struct bio *new = bio_alloc_clone(bdev, fullbio, + GFP_NOIO, &fs_bio_set); + + if (bio) { + bio_trim(bio, start_idx * PAGE_SECTORS, + (end_idx - start_idx) * PAGE_SECTORS); + bio_chain(bio, new); + submit_bio(bio); + } + + bio = new; + start_idx = idx; + } + + idx++; + end_idx = idx; + } + + if (bio) { + bio_trim(bio, start_idx * PAGE_SECTORS, + (end_idx - start_idx) * PAGE_SECTORS); + err = submit_bio_wait(bio); + bio_put(bio); + } + + if (err) + return err; + + if (head_to_cache) { + int ret = add_to_page_cache_lru(head_to_cache, cache_mapping, + read_start, GFP_NOIO); + + if (!ret) { + SetPageUptodate(head_to_cache); + unlock_page(head_to_cache); + } + + } + + if (tail_to_cache) { + int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping, + read_end - PAGE_SIZE, GFP_NOIO); + + if (!ret) { + SetPageUptodate(tail_to_cache); + unlock_page(tail_to_cache); + } + } + + return 0; +} + static int squashfs_bio_read(struct super_block *sb, u64 index, int length, struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; + struct inode *cache_inode = msblk->cache_inode; + struct address_space *cache_mapping = cache_inode ? 
cache_inode->i_mapping : NULL; const u64 read_start = round_down(index, msblk->devblksize); const sector_t block = read_start >> msblk->devblksize_log2; const u64 read_end = round_up(index + length, msblk->devblksize); @@ -99,13 +189,27 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, for (i = 0; i < page_count; ++i) { unsigned int len = min_t(unsigned int, PAGE_SIZE - offset, total_len); - struct page *page = alloc_page(GFP_NOIO); + struct page *page = NULL; + + if (cache_mapping) + page = find_get_page(cache_mapping, + read_start + i * PAGE_SIZE); + if (!page) + page = alloc_page(GFP_NOIO); if (!page) { error = -ENOMEM; goto out_free_bio; } - if (!bio_add_page(bio, page, len, offset)) { + + if (cache_mapping) { + /* + * Use the __ version to avoid merging since we need + * each page to be separate when we check for and avoid + * cached pages. + */ + __bio_add_page(bio, page, len, offset); + } else if (!bio_add_page(bio, page, len, offset)) { error = -EIO; goto out_free_bio; } @@ -113,7 +217,12 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, total_len -= len; } - error = submit_bio_wait(bio); + if (cache_mapping) + error = squashfs_bio_read_cached(bio, cache_mapping, index, + length, read_start, read_end, + page_count); + else + error = submit_bio_wait(bio); if (error) goto out_free_bio; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 72f6f4b37863..dfee65845d48 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -47,6 +47,7 @@ struct squashfs_sb_info { struct squashfs_cache *block_cache; struct squashfs_cache *fragment_cache; struct squashfs_cache *read_page; + struct inode *cache_inode; int next_meta_index; __le64 *id_table; __le64 *fragment_index; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index e090fae48e68..64d6bc95950b 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -329,6 +329,16 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } + if (msblk->devblksize == PAGE_SIZE) { + msblk->cache_inode = new_inode(sb); + if (msblk->cache_inode == NULL) + goto failed_mount; + + set_nlink(msblk->cache_inode, 1); + msblk->cache_inode->i_size = OFFSET_MAX; + mapping_set_gfp_mask(msblk->cache_inode->i_mapping, GFP_NOFS); + } + msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); @@ -454,6 +464,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); + iput(msblk->cache_inode); msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); @@ -572,6 +583,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); + iput(sbi->cache_inode); sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 885f5395fcd0..567c547cf371 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -692,7 +692,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); -void cgroup_rstat_flush_atomic(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void 
cgroup_rstat_flush_release(void); diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 31f114f486c4..7af9949828ff 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -168,12 +168,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, } #endif /* CONFIG_DMA_CMA*/ -#ifdef CONFIG_DMA_PERNUMA_CMA -void dma_pernuma_cma_reserve(void); -#else -static inline void dma_pernuma_cma_reserve(void) { } -#endif /* CONFIG_DMA_PERNUMA_CMA */ - #ifdef CONFIG_DMA_DECLARE_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size); diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 481abf530b3c..6d5edef09d45 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -93,6 +93,15 @@ struct kmem_cache; bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +#ifdef CONFIG_FAIL_PAGE_ALLOC +bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +#else +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return false; +} +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed8cb537c6a7..665f06675c83 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -338,19 +338,12 @@ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); -extern void pm_restrict_gfp_mask(void); -extern void pm_restore_gfp_mask(void); - -extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); - -#ifdef CONFIG_PM_SLEEP -extern bool pm_suspended_storage(void); -#else -static inline bool pm_suspended_storage(void) +static inline bool gfp_has_io_fs(gfp_t gfp) { - return false; + return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } -#endif /* CONFIG_PM_SLEEP */ + +extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. 
*/ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d041aa9f0fe..21f942025fec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -133,9 +133,8 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, struct vm_area_struct **, - unsigned long *, unsigned long *, long, unsigned int, - int *); + struct page **, unsigned long *, unsigned long *, + long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); @@ -306,9 +305,8 @@ static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, - struct vm_area_struct **vmas, unsigned long *position, - unsigned long *nr_pages, long i, unsigned int flags, - int *nonblocking) + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags, int *nonblocking) { BUG(); return 0; @@ -757,14 +755,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return folio->_hugetlb_subpool; } -/* - * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool - */ -static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) -{ - return hugetlb_folio_subpool(page_folio(hpage)); -} - static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { @@ -1031,11 +1021,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return NULL; } -static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) -{ - return NULL; -} - static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f7ef70661ce2..819b6bc8ac08 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -343,7 +343,7 @@ static inline void *kasan_reset_tag(const void *addr) * @is_write: whether the bad access is a write or a read * @ip: instruction pointer for the accessibility check or the bad access itself */ -bool kasan_report(unsigned long addr, size_t size, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 1fadb5f5978b..85559a34a098 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -455,7 +455,9 @@ void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); +void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); +void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); @@ -466,7 +468,9 @@ void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); +void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); +void 
*mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); @@ -482,13 +486,13 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, } /* Checks if a mas has not found anything */ -static inline bool mas_is_none(struct ma_state *mas) +static inline bool mas_is_none(const struct ma_state *mas) { return mas->node == MAS_NONE; } /* Checks if a mas has been paused */ -static inline bool mas_is_paused(struct ma_state *mas) +static inline bool mas_is_paused(const struct ma_state *mas) { return mas->node == MAS_PAUSE; } @@ -528,6 +532,19 @@ static inline void mas_reset(struct ma_state *mas) #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * mas_contiguous() - Iterate over a contiguous range of the maple tree. + * @__mas: Maple Tree operation state (maple_state) + * @__entry: Entry retrieved from the tree + * @__max: maximum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range of the + * entry. The loop will terminate on the first NULL encountered. + * + * Note: may return the zero entry. + */ +#define mas_contiguous(__mas, __entry, __max) \ + while (((__entry) = mas_find_range((__mas), (__max))) != NULL) /** * mas_set_range() - Set up Maple Tree operation state for a different index. @@ -616,7 +633,7 @@ static inline void mt_clear_in_rcu(struct maple_tree *mt) return; if (mt_external_lock(mt)) { - BUG_ON(!mt_lock_is_held(mt)); + WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); @@ -635,7 +652,7 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) return; if (mt_external_lock(mt)) { - BUG_ON(!mt_lock_is_held(mt)); + WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); @@ -670,10 +687,17 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); #ifdef CONFIG_DEBUG_MAPLE_TREE +enum mt_dump_format { + mt_dump_dec, + mt_dump_hex, +}; + extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; -void mt_dump(const struct maple_tree *mt); +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); +void mas_dump(const struct ma_state *mas); +void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ @@ -681,7 +705,40 @@ void mt_cache_shrink(void); if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ - mt_dump(__tree); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_BUG_ON(__mas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_WR_BUG_ON(__wrmas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + 
mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ @@ -690,8 +747,67 @@ void mt_cache_shrink(void); atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) + +#define MT_WARN_ON(__tree, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WARN_ON(__mas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) #else -#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) +#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) +#define MT_WARN_ON(__tree, __x) WARN_ON(__x) +#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) +#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ #endif /*_LINUX_MAPLE_TREE_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 222d7370134c..00a88cf947e1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_atomic(void); void mem_cgroup_flush_stats_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, @@ -1537,10 +1536,6 @@ static inline void mem_cgroup_flush_stats(void) { } -static inline void mem_cgroup_flush_stats_atomic(void) -{ -} - static inline void mem_cgroup_flush_stats_ratelimited(void) { } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 9fcbf5706595..04bc286eed42 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -326,9 +326,6 @@ static inline int remove_memory(u64 start, u64 size) static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79..66032f0d515c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -866,11 +866,24 
@@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) return mas_find(&vmi->mas, ULONG_MAX); } +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + + static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } +static inline +struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) +{ + return mas_prev_range(&vmi->mas, 0); +} + static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; @@ -2353,6 +2366,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } +static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, + unsigned long addr); + extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, @@ -2361,19 +2377,42 @@ extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + int *locked); + +static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, + unsigned long addr, + int gup_flags, + struct vm_area_struct **vmap) +{ + struct page *page; + struct vm_area_struct *vma; + int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); + + if (got < 0) + return ERR_PTR(got); + if (got == 0) + return NULL; + + vma = vma_lookup(mm, addr); + if (WARN_ON_ONCE(!vma)) { + put_page(page); + return ERR_PTR(-EINVAL); + } + + *vmap = vma; + return page; +} + long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, @@ -2422,6 +2461,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +bool vma_needs_dirty_tracking(struct vm_area_struct *vma); int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { @@ -2994,12 +3034,6 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_range(unsigned long, int, unsigned long, - unsigned long, unsigned long, enum meminit_context, - struct vmem_altmap *, int migratetype); -extern void setup_per_zone_wmarks(void); -extern void calculate_min_free_kbytes(void); -extern int __meminit 
init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); @@ -3020,11 +3054,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -/* page_alloc.c */ -extern int min_free_kbytes; -extern int watermark_boost_factor; -extern int watermark_scale_factor; - /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); @@ -3471,9 +3500,58 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } + +extern unsigned int _debug_guardpage_minorder; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool debug_guardpage_enabled(void) +{ + return static_branch_unlikely(&_debug_guardpage_enabled); +} + +static inline bool page_is_guard(struct page *page) +{ + if (!debug_guardpage_enabled()) + return false; + + return PageGuard(page); +} + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype); +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) +{ + if (!debug_guardpage_enabled()) + return false; + return __set_page_guard(zone, page, order, migratetype); +} + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype); +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) +{ + if (!debug_guardpage_enabled()) + return; + __clear_page_guard(zone, page, order, migratetype); +} + #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool debug_guardpage_enabled(void) { return false; } +static inline bool page_is_guard(struct page *page) { return false; } +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA @@ -3586,6 +3664,10 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -3678,11 +3760,6 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; -/* - * Sysfs entries for memory failure handling statistics. 
- */ -extern const struct attribute_group memory_failure_attr_group; - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, @@ -3712,33 +3789,6 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#ifdef CONFIG_DEBUG_PAGEALLOC -extern unsigned int _debug_guardpage_minorder; -DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static inline unsigned int debug_guardpage_minorder(void) -{ - return _debug_guardpage_minorder; -} - -static inline bool debug_guardpage_enabled(void) -{ - return static_branch_unlikely(&_debug_guardpage_enabled); -} - -static inline bool page_is_guard(struct page *page) -{ - if (!debug_guardpage_enabled()) - return false; - - return PageGuard(page); -} -#else -static inline unsigned int debug_guardpage_minorder(void) { return 0; } -static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index b8728d11c949..7c3e7b0b0e8f 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -8,10 +8,12 @@ struct page; struct vm_area_struct; struct mm_struct; +struct vma_iterator; void dump_page(struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) @@ -74,6 +76,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_MM(cond, mm) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_mm(mm); \ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -90,6 +103,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a4889c9d4055..3a68326c9989 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1512,27 +1512,6 @@ static inline bool has_managed_dma(void) } #endif -/* These two functions are used to setup the per zone pages min values */ -struct ctl_table; - -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, - size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, - size_t *, loff_t *); -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int numa_zonelist_order_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -extern int percpu_pagelist_high_fraction; -extern char numa_zonelist_order[]; -#define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NUMA diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a56308a9d1a4..c1ae5ebc375f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1078,8 +1078,6 @@ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, #else #define filemap_migrate_folio NULL #endif -void page_endio(struct page *page, bool is_write, int err); - void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e58306783d8e..8bbd1639ae85 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -94,6 +94,8 @@ static inline int stack_depot_early_init(void) { return 0; } depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool can_alloc); +void stack_depot_inc_count(depot_stack_handle_t handle); +void stack_depot_dec_count(depot_stack_handle_t handle); /** * stack_depot_save - Save a stack trace to stack depot @@ -110,6 +112,13 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +#ifdef CONFIG_PAGE_OWNER +struct seq_file; +void *stack_start(struct seq_file *m, loff_t *ppos); +void *stack_next(struct seq_file *m, void *v, loff_t *ppos); +int stack_print(struct seq_file *m, void *v); +#endif + /** * stack_depot_fetch - Fetch a stack trace from stack depot * diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d0d4598a7b3f..76923051c03d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -364,9 +364,6 @@ struct pbe { struct pbe *next; }; -/* mm/page_alloc.c */ -extern void mark_free_pages(struct zone *zone); - /** * struct platform_hibernation_ops - hibernation platform support * @@ -505,6 +502,11 @@ extern void pm_report_max_hw_sleep(u64 t); extern bool events_check_enabled; extern suspend_state_t pm_suspend_target_state; +static inline bool pm_suspended_storage(void) +{ + return !gfp_has_io_fs(gfp_allowed_mask); +} + extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); @@ -538,6 +540,7 @@ static inline void 
ksys_sync_helper(void) {} #define pm_notifier(fn, pri) do { (void)(fn); } while (0) +static inline bool pm_suspended_storage(void) { return false; } static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} diff --git a/include/linux/swap.h b/include/linux/swap.h index 3c69cb653cb9..b2128df5edea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -368,6 +368,7 @@ static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) } /* linux/mm/workingset.c */ +bool workingset_test_recent(void *shadow, bool file, bool *workingset); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33a0ee3bcb2e..6648c07c4381 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -72,6 +72,8 @@ struct open_how; struct mount_attr; struct landlock_ruleset_attr; enum landlock_rule_type; +struct cachestat_range; +struct cachestat; #include <linux/types.h> #include <linux/aio_abi.h> @@ -1058,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags); asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, unsigned long home_node, unsigned long flags); +asmlinkage long sys_cachestat(unsigned int fd, + struct cachestat_range __user *cstat_range, + struct cachestat __user *cstat, unsigned int flags); /* * Architecture-specific system calls diff --git a/include/linux/types.h b/include/linux/types.h index 688fb943556a..7e5a1fb7cfa1 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -35,6 +35,7 @@ typedef __kernel_uid16_t uid16_t; typedef __kernel_gid16_t gid16_t; typedef unsigned long uintptr_t; +typedef long intptr_t; #ifdef CONFIG_HAVE_UID16 /* This is defined by include/asm-{arch}/posix_types.h */ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 45fa180cc56a..cd639fae9086 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_cachestat 451 +__SYSCALL(__NR_cachestat, sys_cachestat) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index f55bc680b5b0..a246e11988d5 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -4,6 +4,7 @@ #include <asm/mman.h> #include <asm-generic/hugetlb_encode.h> +#include <linux/types.h> #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -41,4 +42,17 @@ #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB +struct cachestat_range { + __u64 off; + __u64 len; +}; + +struct cachestat { + __u64 nr_cache; + __u64 nr_dirty; + __u64 nr_writeback; + __u64 nr_evicted; + __u64 nr_recently_evicted; +}; + #endif /* _UAPI_LINUX_MMAN_H */ diff --git a/init/Kconfig b/init/Kconfig index 32c24950c4ce..f7f65af4ee12 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1771,6 +1771,16 @@ config RSEQ If unsure, say Y. 
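At this point the series has everything userspace needs: syscall number 451 in the tables, the sys_cachestat() prototype, and the cachestat_range/cachestat structures in <uapi/linux/mman.h>; the Kconfig knob gating the call follows below. A minimal sketch of invoking it, assuming a kernel with this series applied and no libc wrapper yet (so the call goes through syscall(2)); the structures are redeclared locally only to keep the sketch self-contained, and the file name is arbitrary:

/* Illustrative userspace sketch of cachestat(); not part of this series. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_cachestat
#define __NR_cachestat 451	/* per the syscall tables wired up above */
#endif

struct cachestat_range { uint64_t off, len; };	/* mirrors <linux/mman.h> */
struct cachestat {
	uint64_t nr_cache, nr_dirty, nr_writeback,
		 nr_evicted, nr_recently_evicted;
};

int main(int argc, char **argv)
{
	struct cachestat_range range = { 0, 0 };	/* len == 0: query to end of file */
	struct cachestat cs;
	int fd = open(argc > 1 ? argv[1] : "/etc/os-release", O_RDONLY);

	if (fd < 0 || syscall(__NR_cachestat, fd, &range, &cs, 0) != 0) {
		perror("cachestat");
		return 1;
	}
	printf("cached %llu dirty %llu writeback %llu evicted %llu recently evicted %llu\n",
	       (unsigned long long)cs.nr_cache, (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback, (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}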
+config CACHESTAT_SYSCALL + bool "Enable cachestat() system call" if EXPERT + default y + help + Enable the cachestat system call, which queries the page cache + statistics of a file (number of cached pages, dirty pages, + pages marked for writeback, (recently) evicted pages). + + If unsure, say Y here. + config DEBUG_RSEQ default n bool "Enabled debugging of rseq() system call" if EXPERT diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d46f72a5ef73..b56bda46a9eb 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1030,9 +1030,8 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; - struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - int i, pret, ret = -ENOMEM; + int pret, ret = -ENOMEM; end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; @@ -1042,45 +1041,24 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) if (!pages) goto done; - vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - goto done; - ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret == nr_pages) { - /* don't support file backed memory */ - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma_is_shmem(vma)) - continue; - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } + pages); + if (pret == nr_pages) *npages = nr_pages; - } else { + else ret = pret < 0 ? pret : -EFAULT; - } + mmap_read_unlock(current->mm); if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ + /* if we did partial map, release any pages we did get */ if (pret > 0) unpin_user_pages(pages, pret); goto done; } ret = 0; done: - kvfree(vmas); if (ret < 0) { kvfree(pages); pages = ERR_PTR(ret); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9c4c55228567..2542c21b6b6d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __diag_pop(); /* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) +static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; @@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) } raw_spin_unlock_irqrestore(cpu_lock, flags); - /* if @may_sleep, play nice and yield if necessary */ - if (may_sleep && (need_resched() || - spin_needbreak(&cgroup_rstat_lock))) { + /* play nice and yield if necessary */ + if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { spin_unlock_irq(&cgroup_rstat_lock); if (!cond_resched()) cpu_relax(); @@ -236,26 +235,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); spin_unlock_irq(&cgroup_rstat_lock); } /** - * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush() - * @cgrp: target cgroup - * - * This function can be called from any context.
- */ -void cgroup_rstat_flush_atomic(struct cgroup *cgrp) -{ - unsigned long flags; - - spin_lock_irqsave(&cgroup_rstat_lock, flags); - cgroup_rstat_flush_locked(cgrp, false); - spin_unlock_irqrestore(&cgroup_rstat_lock, flags); -} - -/** * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup * @@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); } /** diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 6677d0e64d27..79f83091e3a2 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -140,10 +140,10 @@ if DMA_CMA config DMA_PERNUMA_CMA bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" - default NUMA && ARM64 + default NUMA help - Enable this option to get pernuma CMA areas so that devices like - ARM64 SMMU can get local memory by DMA coherent APIs. + Enable this option to get pernuma CMA areas so that NUMA devices + can get local memory by DMA coherent APIs. You can set the size of pernuma CMA by specifying "cma_pernuma=size" on the kernel's command line. diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 6ea80ae42622..26a8e5365fcd 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -128,7 +128,7 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif #ifdef CONFIG_DMA_PERNUMA_CMA -void __init dma_pernuma_cma_reserve(void) +static void __init dma_pernuma_cma_reserve(void) { int nid; @@ -153,6 +153,10 @@ void __init dma_pernuma_cma_reserve(void) (unsigned long long)pernuma_size_bytes / SZ_1M, nid); } } +#else +static inline void __init dma_pernuma_cma_reserve(void) +{ +} #endif /** @@ -171,6 +175,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit) phys_addr_t selected_limit = limit; bool fixed = false; + dma_pernuma_cma_reserve(); + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); if (size_cmdline != -1) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 59887c69d54c..cac3aef7c6f7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; - struct vm_area_struct *vma; int ret; short *ptr; @@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, - FOLL_WRITE, &page, &vma, NULL); + FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, @@ -474,10 +473,9 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, - &old_page, &vma, NULL); - if (ret <= 0) - return ret; + old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR_OR_NULL(old_page)) + return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) @@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. 
*/ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, - NULL, NULL); + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); if (result < 0) return result; diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..d17995934eb4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -252,23 +252,19 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) { int i; int ret; + int nr_charged = 0; - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) goto err; + nr_charged++; } return 0; err: - /* - * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is - * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will - * ignore this page. - */ - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); return ret; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 490792b1066e..08455fb48ba6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -312,10 +312,10 @@ void __noreturn kthread_exit(long result) * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * - * If present complete @comp and the reuturn code to kthread_stop(). + * If present, complete @comp and then return the code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of - * @comp can use this function exit safely. + * @comp can use this function to exit safely. * * Does not return. */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 3113ec2f1db4..34fc8359145b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,33 @@ #include "power.h" #ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification).
+ */ +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); +} unsigned int lock_system_sleep(void) { diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..ac14d1b463d1 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -216,6 +216,11 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); +void pm_restrict_gfp_mask(void); +void pm_restore_gfp_mask(void); +#else +static inline void pm_restrict_gfp_mask(void) {} +static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cd8b7b35f1e8..45ef0bf81c85 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1228,6 +1228,58 @@ unsigned int snapshot_additional_pages(struct zone *zone) return 2 * rtree; } +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. + */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} + #ifdef CONFIG_HIGHMEM /** * count_free_highmem_pages - Compute the total number of free highmem pages. 
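The restrict/restore pair added above is strictly symmetric and must run under system_transition_mutex, which lock_system_sleep() (visible at the end of the main.c hunk) acquires. A condensed sketch of the intended call pattern; example_enter_sleep() and example_quiesce_devices() are illustrative names, not code from this patch:

/* Illustrative pairing of the helpers above around a device-quiesce step. */
extern int example_quiesce_devices(void);	/* hypothetical */

static int example_enter_sleep(void)
{
	/* Takes system_transition_mutex, satisfying the
	 * WARN_ON(!mutex_is_locked(...)) checks in both helpers. */
	unsigned int sleep_flags = lock_system_sleep();
	int error;

	pm_restrict_gfp_mask();		/* gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS) */
	error = example_quiesce_devices();
	pm_restore_gfp_mask();		/* put the saved mask back */

	unlock_system_sleep(sleep_flags);
	return error;
}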
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..04bfb1e4d377 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); +COND_SYSCALL(cachestat); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..a57de67f032f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2120,13 +2120,6 @@ static struct ctl_table vm_table[] = { }, #endif { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, - { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), @@ -2136,39 +2129,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_FOUR, }, { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_THREE_THOUSAND, - }, - { - .procname = "percpu_pagelist_high_fraction", - .data = &percpu_pagelist_high_fraction, - .maxlen = sizeof(percpu_pagelist_high_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, .maxlen = sizeof(sysctl_page_lock_unfairness), @@ -2223,24 +2183,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, #endif #ifdef CONFIG_SMP { @@ -2267,15 +2209,6 @@ static struct ctl_table vm_table[] = { .proc_handler = mmap_min_addr_handler, }, #endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..bdc2666e8d39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -406,7 +406,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) diff --git a/lib/Kconfig.debug 
b/lib/Kconfig.debug index ce51d4dc6803..f202648dead9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2302,9 +2302,13 @@ config TEST_XARRAY tristate "Test the XArray code at runtime" config TEST_MAPLE_TREE - depends on DEBUG_KERNEL - select DEBUG_MAPLE_TREE - tristate "Test the Maple Tree code at runtime" + tristate "Test the Maple Tree code at runtime or module load" + help + Enable this option to test the maple tree code functions at boot, or + when the module is loaded. Enabling "Debug Maple Trees" will enable + more verbose output on failures. + + If unsure, say N. config TEST_RHASHTABLE tristate "Perform selftest on resizable hash table" diff --git a/lib/Makefile b/lib/Makefile index 876fcdeae34e..38f23f352736 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,7 +30,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ maple_tree.o idr.o extable.o irq_regs.o argv_split.o \ - flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8ebc43d4cc8c..4eb220008f72 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -194,7 +194,7 @@ static void mas_set_height(struct ma_state *mas) unsigned int new_flags = mas->tree->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; - BUG_ON(mas->depth > MAPLE_HEIGHT_MAX); + MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; mas->tree->ma_flags = new_flags; } @@ -240,12 +240,12 @@ static inline void mas_set_err(struct ma_state *mas, long err) mas->node = MA_ERROR(err); } -static inline bool mas_is_ptr(struct ma_state *mas) +static inline bool mas_is_ptr(const struct ma_state *mas) { return mas->node == MAS_ROOT; } -static inline bool mas_is_start(struct ma_state *mas) +static inline bool mas_is_start(const struct ma_state *mas) { return mas->node == MAS_START; } @@ -425,28 +425,26 @@ static inline unsigned long mte_parent_slot_mask(unsigned long parent) } /* - * mas_parent_enum() - Return the maple_type of the parent from the stored + * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state - * @node: The maple_enode to extract the parent's enum + * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline -enum maple_type mte_parent_enum(struct maple_enode *p_enode, - struct maple_tree *mt) +enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; - p_type = (unsigned long)p_enode; - if (p_type & MAPLE_PARENT_ROOT) - return 0; /* Validated in the caller.
*/ + p_type = (unsigned long)mte_to_node(enode)->parent; + if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) + return 0; p_type &= MAPLE_NODE_MASK; - p_type = p_type & ~(MAPLE_PARENT_ROOT | mte_parent_slot_mask(p_type)); - + p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ - if (mt_is_alloc(mt)) + if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } @@ -454,14 +452,8 @@ enum maple_type mte_parent_enum(struct maple_enode *p_enode, return 0; } -static inline -enum maple_type mas_parent_enum(struct ma_state *mas, struct maple_enode *enode) -{ - return mte_parent_enum(ma_enode_ptr(mte_to_node(enode)->parent), mas->tree); -} - /* - * mte_set_parent() - Set the parent node and encode the slot + * mas_set_parent() - Set the parent node and encode the slot * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. @@ -470,16 +462,16 @@ enum maple_type mas_parent_enum(struct ma_state *mas, struct maple_enode *enode) * parent type. */ static inline -void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, - unsigned char slot) +void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, + const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); - BUG_ON(p_type == maple_dense); - BUG_ON(p_type == maple_leaf_64); + MAS_BUG_ON(mas, p_type == maple_dense); + MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: @@ -671,22 +663,22 @@ static inline unsigned long *ma_gaps(struct maple_node *node, } /* - * mte_pivot() - Get the pivot at @piv of the maple encoded node. - * @mn: The maple encoded node. + * mas_pivot() - Get the pivot at @piv of the maple encoded node. + * @mas: The maple state. * @piv: The pivot. * * Return: the pivot at @piv of @mn. 
*/ -static inline unsigned long mte_pivot(const struct maple_enode *mn, - unsigned char piv) +static inline unsigned long mas_pivot(struct ma_state *mas, unsigned char piv) { - struct maple_node *node = mte_to_node(mn); - enum maple_type type = mte_node_type(mn); + struct maple_node *node = mas_mn(mas); + enum maple_type type = mte_node_type(mas->node); - if (piv >= mt_pivots[type]) { - WARN_ON(1); + if (MAS_WARN_ON(mas, piv >= mt_pivots[type])) { + mas_set_err(mas, -EIO); return 0; } + switch (type) { case maple_arange_64: return node->ma64.pivot[piv]; @@ -971,8 +963,6 @@ static inline unsigned char ma_meta_end(struct maple_node *mn, static inline unsigned char ma_meta_gap(struct maple_node *mn, enum maple_type mt) { - BUG_ON(mt != maple_arange_64); - return mn->ma64.meta.gap; } @@ -1111,7 +1101,6 @@ static int mas_ascend(struct ma_state *mas) enum maple_type a_type; unsigned long min, max; unsigned long *pivots; - unsigned char offset; bool set_max = false, set_min = false; a_node = mas_mn(mas); @@ -1123,8 +1112,9 @@ static int mas_ascend(struct ma_state *mas) p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; - a_type = mas_parent_enum(mas, mas->node); - offset = mte_parent_slot(mas->node); + + a_type = mas_parent_type(mas, mas->node); + mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ @@ -1132,7 +1122,6 @@ static int mas_ascend(struct ma_state *mas) return 1; mas->node = a_enode; - mas->offset = offset; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; @@ -1140,11 +1129,17 @@ static int mas_ascend(struct ma_state *mas) return 0; } + if (!mas->min) + set_min = true; + + if (mas->max == ULONG_MAX) + set_max = true; + min = 0; max = ULONG_MAX; do { p_enode = a_enode; - a_type = mas_parent_enum(mas, p_enode); + a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); @@ -1401,9 +1396,9 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) mas->min = 0; mas->max = ULONG_MAX; - mas->depth = 0; retry: + mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { @@ -1631,6 +1626,7 @@ static inline unsigned long mas_max_gap(struct ma_state *mas) return mas_leaf_max_gap(mas); node = mas_mn(mas); + MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node, mt); if (offset == MAPLE_ARANGE64_META_MAX) return 0; @@ -1659,11 +1655,12 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, enum maple_type pmt; pnode = mte_parent(mas->node); - pmt = mas_parent_enum(mas, mas->node); + pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: + MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode, pmt); if (meta_offset == MAPLE_ARANGE64_META_MAX) meta_gap = 0; @@ -1691,7 +1688,7 @@ ascend: /* Go to the parent node. 
*/ pnode = mte_parent(penode); - pmt = mas_parent_enum(mas, penode); + pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); @@ -1718,7 +1715,7 @@ static inline void mas_update_gap(struct ma_state *mas) pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), - mas_parent_enum(mas, mas->node))[pslot]; + mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); @@ -1743,7 +1740,7 @@ static inline void mas_adopt_children(struct ma_state *mas, offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); - mte_set_parent(child, parent, offset); + mas_set_parent(mas, child, parent, offset); } while (offset--); } @@ -1767,7 +1764,7 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), - mas_parent_enum(mas, mas->node)); + mas_parent_type(mas, mas->node)); old_enode = mas_slot_locked(mas, slots, offset); } @@ -1943,8 +1940,9 @@ static inline int mab_calc_split(struct ma_state *mas, * causes one node to be deficient. * NOTE: mt_min_slots is 1 based, b_end and split are zero. */ - while (((bn->pivot[split] - min) < slot_count - 1) && - (split < slot_count - 1) && (b_end - split > slot_min)) + while ((split < slot_count - 1) && + ((bn->pivot[split] - min) < slot_count - 1) && + (b_end - split > slot_min)) split++; } @@ -2347,7 +2345,8 @@ static inline void mas_topiary_range(struct ma_state *mas, void __rcu **slots; unsigned char offset; - MT_BUG_ON(mas->tree, mte_is_leaf(mas->node)); + MAS_BUG_ON(mas, mte_is_leaf(mas->node)); + slots = ma_slots(mas_mn(mas), mte_node_type(mas->node)); for (offset = start; offset <= end; offset++) { struct maple_enode *enode = mas_slot_locked(mas, slots, offset); @@ -2707,9 +2706,9 @@ static inline void mas_set_split_parent(struct ma_state *mas, return; if ((*slot) <= split) - mte_set_parent(mas->node, left, *slot); + mas_set_parent(mas, mas->node, left, *slot); else if (right) - mte_set_parent(mas->node, right, (*slot) - split - 1); + mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } @@ -3106,12 +3105,12 @@ static int mas_spanning_rebalance(struct ma_state *mas, mte_node_type(mast->orig_l->node)); mast->orig_l->depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); - mte_set_parent(left, l_mas.node, slot); + mas_set_parent(mas, left, l_mas.node, slot); if (middle) - mte_set_parent(middle, l_mas.node, ++slot); + mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) - mte_set_parent(right, l_mas.node, ++slot); + mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: @@ -3250,7 +3249,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; eparent = mt_mk_node(mte_parent(l_mas.node), - mas_parent_enum(&l_mas, l_mas.node)); + mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; @@ -3293,7 +3292,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end /* replace parent. 
*/ offset = mte_parent_slot(mas->node); - mt = mas_parent_enum(&l_mas, l_mas.node); + mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); @@ -3338,8 +3337,8 @@ static inline bool mas_split_final_node(struct maple_subtree_state *mast, * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); - mte_set_parent(mast->l->node, ancestor, mast->l->offset); - mte_set_parent(mast->r->node, ancestor, mast->r->offset); + mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); + mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; @@ -4263,11 +4262,13 @@ done: static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { - while ((wr_mas->mas->last > wr_mas->end_piv) && - (wr_mas->offset_end < wr_mas->node_end)) - wr_mas->end_piv = wr_mas->pivots[++wr_mas->offset_end]; + while ((wr_mas->offset_end < wr_mas->node_end) && + (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) + wr_mas->offset_end++; - if (wr_mas->mas->last > wr_mas->end_piv) + if (wr_mas->offset_end < wr_mas->node_end) + wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; + else wr_mas->end_piv = wr_mas->mas->max; } @@ -4424,7 +4425,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas) } /* At this point, we are at the leaf node that needs to be altered. */ - wr_mas->end_piv = wr_mas->r_max; mas_wr_end_piv(wr_mas); if (!wr_mas->entry) @@ -4498,6 +4498,25 @@ exists: } +static inline void mas_rewalk(struct ma_state *mas, unsigned long index) +{ +retry: + mas_set(mas, index); + mas_state_walk(mas); + if (mas_is_start(mas)) + goto retry; +} + +static inline bool mas_rewalk_if_dead(struct ma_state *mas, + struct maple_node *node, const unsigned long index) +{ + if (unlikely(ma_dead_node(node))) { + mas_rewalk(mas, index); + return true; + } + return false; +} + /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or MAS_NONE. 
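mas_prev_node() documented here and the mas_prev_slot() helper introduced in the next hunk are the node-level machinery behind the public reverse iterators reworked later in this file (mas_prev(), mas_find_rev()). For orientation, a small sketch of the consumer-side API they serve, written against the public maple tree interface rather than taken from this patch:

/* Walk an already-populated tree backwards from ULONG_MAX, printing
 * each occupied range; mas.index/mas.last hold the bounds of the
 * range containing the returned entry. */
#include <linux/maple_tree.h>

static void example_walk_backwards(struct maple_tree *mt)
{
	MA_STATE(mas, mt, ULONG_MAX, ULONG_MAX);
	void *entry;

	rcu_read_lock();
	while ((entry = mas_prev(&mas, 0)) != NULL)
		pr_info("range %lu-%lu -> %p\n", mas.index, mas.last, entry);
	rcu_read_unlock();
}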
@@ -4513,15 +4532,19 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) int offset, level; void __rcu **slots; struct maple_node *node; - struct maple_enode *enode; unsigned long *pivots; + unsigned long max; - if (mas_is_none(mas)) - return 0; + node = mas_mn(mas); + if (!mas->min) + goto no_entry; + + max = mas->min - 1; + if (max < min) + goto no_entry; level = 0; do { - node = mas_mn(mas); if (ma_is_root(node)) goto no_entry; @@ -4530,64 +4553,41 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) return 1; offset = mas->offset; level++; + node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); - node = mas_mn(mas); - slots = ma_slots(node, mt); - pivots = ma_pivots(node, mt); - if (unlikely(ma_dead_node(node))) - return 1; - - mas->max = pivots[offset]; - if (offset) - mas->min = pivots[offset - 1] + 1; - if (unlikely(ma_dead_node(node))) - return 1; - - if (mas->max < min) - goto no_entry_min; - while (level > 1) { level--; - enode = mas_slot(mas, slots, offset); + slots = ma_slots(node, mt); + mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; - mas->node = enode; mt = mte_node_type(mas->node); node = mas_mn(mas); - slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); - offset = ma_data_end(node, mt, pivots, mas->max); + offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; - - if (offset) - mas->min = pivots[offset - 1] + 1; - - if (offset < mt_pivots[mt]) - mas->max = pivots[offset]; - - if (mas->max < min) - goto no_entry; } + slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); + pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; + if (likely(offset)) + mas->min = pivots[offset - 1] + 1; + mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; return 0; -no_entry_min: - mas->offset = offset; - if (offset) - mas->min = pivots[offset - 1] + 1; no_entry: if (unlikely(ma_dead_node(node))) return 1; @@ -4597,6 +4597,76 @@ no_entry: } /* + * mas_prev_slot() - Get the entry in the previous slot + * + * @mas: The maple state + * @min: The minimum starting range + * @empty: Can be empty + * + * Return: The entry in the previous slot which is possibly NULL + */ +static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) +{ + void *entry; + void __rcu **slots; + unsigned long pivot; + enum maple_type type; + unsigned long *pivots; + struct maple_node *node; + unsigned long save_point = mas->index; + +retry: + node = mas_mn(mas); + type = mte_node_type(mas->node); + pivots = ma_pivots(node, type); + if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) + goto retry; + +again: + if (mas->min <= min) { + pivot = mas_safe_min(mas, pivots, mas->offset); + + if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) + goto retry; + + if (pivot <= min) + return NULL; + } + + if (likely(mas->offset)) { + mas->offset--; + mas->last = mas->index - 1; + mas->index = mas_safe_min(mas, pivots, mas->offset); + } else { + if (mas_prev_node(mas, min)) { + mas_rewalk(mas, save_point); + goto retry; + } + + if (mas_is_none(mas)) + return NULL; + + mas->last = mas->max; + node = mas_mn(mas); + type = mte_node_type(mas->node); + pivots = ma_pivots(node, type); + mas->index = pivots[mas->offset - 1] + 1; + } + + slots = ma_slots(node, type); + entry = mas_slot(mas, slots, mas->offset); + if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) + goto retry; + + if (likely(entry)) +
return entry; + + if (!empty) + goto again; + + return entry; +} + +/* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @max: The maximum pivot value to check. @@ -4607,11 +4677,10 @@ no_entry: static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { - unsigned long min, pivot; + unsigned long min; unsigned long *pivots; struct maple_enode *enode; int level = 0; - unsigned char offset; unsigned char node_end; enum maple_type mt; void __rcu **slots; @@ -4619,19 +4688,16 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, if (mas->max >= max) goto no_entry; + min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto no_entry; - min = mas->max + 1; - if (min > max) - goto no_entry; - + /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; - offset = mas->offset; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); @@ -4640,36 +4706,37 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, if (unlikely(ma_dead_node(node))) return 1; - } while (unlikely(offset == node_end)); + } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); - pivot = mas_safe_pivot(mas, pivots, ++offset, mt); - while (unlikely(level > 1)) { - /* Descend, if necessary */ - enode = mas_slot(mas, slots, offset); - if (unlikely(ma_dead_node(node))) - return 1; + mas->offset++; + enode = mas_slot(mas, slots, mas->offset); + if (unlikely(ma_dead_node(node))) + return 1; - mas->node = enode; + if (level > 1) + mas->offset = 0; + + while (unlikely(level > 1)) { level--; + mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); - pivots = ma_pivots(node, mt); + enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; - - offset = 0; - pivot = pivots[0]; } - enode = mas_slot(mas, slots, offset); + if (!mas->offset) + pivots = ma_pivots(node, mt); + + mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; - mas->max = pivot; return 0; no_entry: @@ -4681,92 +4748,88 @@ no_entry: } /* - * mas_next_nentry() - Get the next node entry - * @mas: The maple state - * @max: The maximum value to check - * @*range_start: Pointer to store the start of the range. + * mas_next_slot() - Get the entry in the next slot * - * Sets @mas->offset to the offset of the next node entry, @mas->last to the - * pivot of the entry. 
+ * @mas: The maple state + * @max: The maximum starting range + * @empty: Can be empty * - * Return: The next entry, %NULL otherwise + * Return: The entry in the next slot which is possibly NULL */ -static inline void *mas_next_nentry(struct ma_state *mas, - struct maple_node *node, unsigned long max, enum maple_type type) +static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { - unsigned char count; - unsigned long pivot; - unsigned long *pivots; void __rcu **slots; + unsigned long *pivots; + unsigned long pivot; + enum maple_type type; + struct maple_node *node; + unsigned char data_end; + unsigned long save_point = mas->last; void *entry; - if (mas->last == mas->max) { - mas->index = mas->max; - return NULL; - } - - slots = ma_slots(node, type); +retry: + node = mas_mn(mas); + type = mte_node_type(mas->node); pivots = ma_pivots(node, type); - count = ma_data_end(node, type, pivots, mas->max); - if (unlikely(ma_dead_node(node))) - return NULL; - - mas->index = mas_safe_min(mas, pivots, mas->offset); - if (unlikely(ma_dead_node(node))) - return NULL; - - if (mas->index > max) - return NULL; - - if (mas->offset > count) - return NULL; + data_end = ma_data_end(node, type, pivots, mas->max); + if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) + goto retry; - while (mas->offset < count) { - pivot = pivots[mas->offset]; - entry = mas_slot(mas, slots, mas->offset); - if (ma_dead_node(node)) - return NULL; +again: + if (mas->max >= max) { + if (likely(mas->offset < data_end)) + pivot = pivots[mas->offset]; + else + return NULL; /* must be mas->max */ - if (entry) - goto found; + if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) + goto retry; if (pivot >= max) return NULL; + } - mas->index = pivot + 1; + if (likely(mas->offset < data_end)) { + mas->index = pivots[mas->offset] + 1; mas->offset++; - } + if (likely(mas->offset < data_end)) + mas->last = pivots[mas->offset]; + else + mas->last = mas->max; + } else { + if (mas_next_node(mas, node, max)) { + mas_rewalk(mas, save_point); + goto retry; + } - if (mas->index > mas->max) { - mas->index = mas->last; - return NULL; + if (mas_is_none(mas)) + return NULL; + + mas->offset = 0; + mas->index = mas->min; + node = mas_mn(mas); + type = mte_node_type(mas->node); + pivots = ma_pivots(node, type); + mas->last = pivots[0]; } - pivot = mas_safe_pivot(mas, pivots, mas->offset, type); - entry = mas_slot(mas, slots, mas->offset); - if (ma_dead_node(node)) - return NULL; + slots = ma_slots(node, type); + entry = mt_slot(mas->tree, slots, mas->offset); + if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) + goto retry; - if (!pivot) - return NULL; + if (entry) + return entry; - if (!entry) - return NULL; + if (!empty) { + if (!mas->offset) + data_end = 2; + goto again; + } -found: - mas->last = pivot; return entry; } -static inline void mas_rewalk(struct ma_state *mas, unsigned long index) -{ -retry: - mas_set(mas, index); - mas_state_walk(mas); - if (mas_is_start(mas)) - goto retry; -} - /* * mas_next_entry() - Internal function to get the next entry. 
* @mas: The maple state @@ -4781,155 +4844,10 @@ retry: */ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { - void *entry = NULL; - struct maple_enode *prev_node; - struct maple_node *node; - unsigned char offset; - unsigned long last; - enum maple_type mt; - - if (mas->index > limit) { - mas->index = mas->last = limit; - mas_pause(mas); + if (mas->last >= limit) return NULL; - } - last = mas->last; -retry: - offset = mas->offset; - prev_node = mas->node; - node = mas_mn(mas); - mt = mte_node_type(mas->node); - mas->offset++; - if (unlikely(mas->offset >= mt_slots[mt])) { - mas->offset = mt_slots[mt] - 1; - goto next_node; - } - while (!mas_is_none(mas)) { - entry = mas_next_nentry(mas, node, limit, mt); - if (unlikely(ma_dead_node(node))) { - mas_rewalk(mas, last); - goto retry; - } - - if (likely(entry)) - return entry; - - if (unlikely((mas->index > limit))) - break; - -next_node: - prev_node = mas->node; - offset = mas->offset; - if (unlikely(mas_next_node(mas, node, limit))) { - mas_rewalk(mas, last); - goto retry; - } - mas->offset = 0; - node = mas_mn(mas); - mt = mte_node_type(mas->node); - } - - mas->index = mas->last = limit; - mas->offset = offset; - mas->node = prev_node; - return NULL; -} - -/* - * mas_prev_nentry() - Get the previous node entry. - * @mas: The maple state. - * @limit: The lower limit to check for a value. - * - * Return: the entry, %NULL otherwise. - */ -static inline void *mas_prev_nentry(struct ma_state *mas, unsigned long limit, - unsigned long index) -{ - unsigned long pivot, min; - unsigned char offset; - struct maple_node *mn; - enum maple_type mt; - unsigned long *pivots; - void __rcu **slots; - void *entry; - -retry: - if (!mas->offset) - return NULL; - - mn = mas_mn(mas); - mt = mte_node_type(mas->node); - offset = mas->offset - 1; - if (offset >= mt_slots[mt]) - offset = mt_slots[mt] - 1; - - slots = ma_slots(mn, mt); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) { - mas_rewalk(mas, index); - goto retry; - } - - if (offset == mt_pivots[mt]) - pivot = mas->max; - else - pivot = pivots[offset]; - - if (unlikely(ma_dead_node(mn))) { - mas_rewalk(mas, index); - goto retry; - } - - while (offset && ((!mas_slot(mas, slots, offset) && pivot >= limit) || - !pivot)) - pivot = pivots[--offset]; - - min = mas_safe_min(mas, pivots, offset); - entry = mas_slot(mas, slots, offset); - if (unlikely(ma_dead_node(mn))) { - mas_rewalk(mas, index); - goto retry; - } - - if (likely(entry)) { - mas->offset = offset; - mas->last = pivot; - mas->index = min; - } - return entry; -} - -static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) -{ - void *entry; - - if (mas->index < min) { - mas->index = mas->last = min; - mas->node = MAS_NONE; - return NULL; - } -retry: - while (likely(!mas_is_none(mas))) { - entry = mas_prev_nentry(mas, min, mas->index); - if (unlikely(mas->last < min)) - goto not_found; - - if (likely(entry)) - return entry; - - if (unlikely(mas_prev_node(mas, min))) { - mas_rewalk(mas, mas->index); - goto retry; - } - - mas->offset++; - } - - mas->offset--; -not_found: - mas->index = mas->last = min; - return NULL; + return mas_next_slot(mas, limit, false); } /* @@ -5105,24 +5023,25 @@ void *mas_walk(struct ma_state *mas) { void *entry; + if (mas_is_none(mas) || mas_is_paused(mas) || mas_is_ptr(mas)) + mas->node = MAS_START; retry: entry = mas_state_walk(mas); - if (mas_is_start(mas)) + if (mas_is_start(mas)) { goto retry; - - if (mas_is_ptr(mas)) { + } else if (mas_is_none(mas)) { + 
mas->index = 0; + mas->last = ULONG_MAX; + } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; - } else { - mas->index = 1; - mas->last = ULONG_MAX; + return entry; } - return entry; - } - if (mas_is_none(mas)) { - mas->index = 0; + mas->index = 1; mas->last = ULONG_MAX; + mas->node = MAS_NONE; + return NULL; } return entry; @@ -5289,7 +5208,10 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long *pivots; enum maple_type mt; - if (min >= max) + if (min > max) + return -EINVAL; + + if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) @@ -5338,7 +5260,10 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, { struct maple_enode *last = mas->node; - if (min >= max) + if (min > max) + return -EINVAL; + + if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) { @@ -5374,7 +5299,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, return -EBUSY; /* Trim the upper limit to the max. */ - if (max <= mas->last) + if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; @@ -5394,8 +5319,8 @@ static inline int mas_alloc(struct ma_state *mas, void *entry, return xa_err(mas->node); if (!mas->index) - return mte_pivot(mas->node, 0); - return mte_pivot(mas->node, 1); + return mas_pivot(mas, 0); + return mas_pivot(mas, 1); } /* Must be walking a tree. */ @@ -5412,7 +5337,10 @@ static inline int mas_alloc(struct ma_state *mas, void *entry, */ min = mas->min; if (mas->offset) - min = mte_pivot(mas->node, mas->offset - 1) + 1; + min = mas_pivot(mas, mas->offset - 1) + 1; + + if (mas_is_err(mas)) + return xa_err(mas->node); if (mas->index < min) mas->index = min; @@ -5694,9 +5622,9 @@ void *mas_store(struct ma_state *mas, void *entry) trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE - if (mas->index > mas->last) - pr_err("Error %lu > %lu %p\n", mas->index, mas->last, entry); - MT_BUG_ON(mas->tree, mas->index > mas->last); + if (MAS_WARN_ON(mas, mas->index > mas->last)) + pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry); + if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; @@ -5756,7 +5684,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry) mas_wr_store_setup(&wr_mas); trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); - BUG_ON(mas_is_err(mas)); + MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); @@ -5808,9 +5736,7 @@ void mas_destroy(struct ma_state *mas) if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; - if (mas_is_start(mas)) - mas_start(mas); - + mas_start(mas); mtree_range_walk(mas); end = mas_data_end(mas) + 1; if (end < mt_min_slot_count(mas->node) - 1) @@ -5900,6 +5826,34 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) } EXPORT_SYMBOL_GPL(mas_expected_entries); +static inline bool mas_next_setup(struct ma_state *mas, unsigned long max, + void **entry) +{ + bool was_none = mas_is_none(mas); + + if (mas_is_none(mas) || mas_is_paused(mas)) + mas->node = MAS_START; + + if (mas_is_start(mas)) + *entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + + if (mas_is_ptr(mas)) { + *entry = NULL; + if (was_none && mas->index == 0) { + mas->index = mas->last = 0; + return true; + } + mas->index = 1; + mas->last = ULONG_MAX; + mas->node = MAS_NONE; + return true; + } + + if (mas_is_none(mas)) + return true; + return false; +} + /** * mas_next() - Get the next entry. 
* @mas: The maple state @@ -5913,27 +5867,38 @@ EXPORT_SYMBOL_GPL(mas_expected_entries); */ void *mas_next(struct ma_state *mas, unsigned long max) { - if (mas_is_none(mas) || mas_is_paused(mas)) - mas->node = MAS_START; + void *entry = NULL; - if (mas_is_start(mas)) - mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + if (mas_next_setup(mas, max, &entry)) + return entry; - if (mas_is_ptr(mas)) { - if (!mas->index) { - mas->index = 1; - mas->last = ULONG_MAX; - } - return NULL; - } + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, false); +} +EXPORT_SYMBOL_GPL(mas_next); - if (mas->last == ULONG_MAX) - return NULL; +/** + * mas_next_range() - Advance the maple state to the next range + * @mas: The maple state + * @max: The maximum index to check. + * + * Sets @mas->index and @mas->last to the range. + * Must hold rcu_read_lock or the write lock. + * Can return the zero entry. + * + * Return: The next entry or %NULL + */ +void *mas_next_range(struct ma_state *mas, unsigned long max) +{ + void *entry = NULL; - /* Retries on dead nodes handled by mas_next_entry */ - return mas_next_entry(mas, max); + if (mas_next_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, true); } -EXPORT_SYMBOL_GPL(mas_next); +EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree @@ -5955,6 +5920,47 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) } EXPORT_SYMBOL_GPL(mt_next); +static inline bool mas_prev_setup(struct ma_state *mas, unsigned long min, + void **entry) +{ + if (mas->index <= min) + goto none; + + if (mas_is_none(mas) || mas_is_paused(mas)) + mas->node = MAS_START; + + if (mas_is_start(mas)) { + mas_walk(mas); + if (!mas->index) + goto none; + } + + if (unlikely(mas_is_ptr(mas))) { + if (!mas->index) + goto none; + mas->index = mas->last = 0; + *entry = mas_root(mas); + return true; + } + + if (mas_is_none(mas)) { + if (mas->index) { + /* Walked to out-of-range pointer? */ + mas->index = mas->last = 0; + mas->node = MAS_ROOT; + *entry = mas_root(mas); + return true; + } + return true; + } + + return false; + +none: + mas->node = MAS_NONE; + return true; +} + /** * mas_prev() - Get the previous entry * @mas: The maple state @@ -5968,37 +5974,37 @@ EXPORT_SYMBOL_GPL(mt_next); */ void *mas_prev(struct ma_state *mas, unsigned long min) { - if (!mas->index) { - /* Nothing comes before 0 */ - mas->last = 0; - mas->node = MAS_NONE; - return NULL; - } + void *entry = NULL; - if (unlikely(mas_is_ptr(mas))) - return NULL; + if (mas_prev_setup(mas, min, &entry)) + return entry; - if (mas_is_none(mas) || mas_is_paused(mas)) - mas->node = MAS_START; + return mas_prev_slot(mas, min, false); +} +EXPORT_SYMBOL_GPL(mas_prev); - if (mas_is_start(mas)) { - mas_walk(mas); - if (!mas->index) - return NULL; - } +/** + * mas_prev_range() - Advance to the previous range + * @mas: The maple state + * @min: The minimum value to check. + * + * Sets @mas->index and @mas->last to the range. + * Must hold rcu_read_lock or the write lock. + * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * searchable nodes. + * + * Return: the previous value or %NULL. 
+ */ +void *mas_prev_range(struct ma_state *mas, unsigned long min) +{ + void *entry = NULL; - if (mas_is_ptr(mas)) { - if (!mas->index) { - mas->last = 0; - return NULL; - } + if (mas_prev_setup(mas, min, &entry)) + return entry; - mas->index = mas->last = 0; - return mas_root_locked(mas); - } - return mas_prev_entry(mas, min); + return mas_prev_slot(mas, min, true); } -EXPORT_SYMBOL_GPL(mas_prev); +EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree @@ -6040,6 +6046,64 @@ void mas_pause(struct ma_state *mas) EXPORT_SYMBOL_GPL(mas_pause); /** + * mas_find_setup() - Internal function to set up mas_find*(). + * @mas: The maple state + * @max: The maximum index + * @entry: Pointer to the entry + * + * Returns: True if entry is the answer, false otherwise. + */ +static inline bool mas_find_setup(struct ma_state *mas, unsigned long max, + void **entry) +{ + *entry = NULL; + + if (unlikely(mas_is_none(mas))) { + if (unlikely(mas->last >= max)) + return true; + + mas->index = mas->last; + mas->node = MAS_START; + } else if (unlikely(mas_is_paused(mas))) { + if (unlikely(mas->last >= max)) + return true; + + mas->node = MAS_START; + mas->index = ++mas->last; + } else if (unlikely(mas_is_ptr(mas))) + goto ptr_out_of_range; + + if (unlikely(mas_is_start(mas))) { + /* First run or continue */ + if (mas->index > max) + return true; + + *entry = mas_walk(mas); + if (*entry) + return true; + + } + + if (unlikely(!mas_searchable(mas))) { + if (unlikely(mas_is_ptr(mas))) + goto ptr_out_of_range; + + return true; + } + + if (mas->index == max) + return true; + + return false; + +ptr_out_of_range: + mas->node = MAS_NONE; + mas->index = 1; + mas->last = ULONG_MAX; + return true; +} + +/** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state @@ -6053,37 +6117,105 @@ EXPORT_SYMBOL_GPL(mas_pause); */ void *mas_find(struct ma_state *mas, unsigned long max) { + void *entry = NULL; + + if (mas_find_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, false); +} +EXPORT_SYMBOL_GPL(mas_find); + +/** + * mas_find_range() - On the first call, find the entry at or after + * mas->index up to %max. Otherwise, advance to the next slot after mas->index. + * @mas: The maple state + * @max: The maximum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. + */ +void *mas_find_range(struct ma_state *mas, unsigned long max) +{ + void *entry; + + if (mas_find_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, true); +} +EXPORT_SYMBOL_GPL(mas_find_range); + +/** + * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() + * @mas: The maple state + * @min: The minimum index + * @entry: Pointer to the entry + * + * Returns: True if entry is the answer, false otherwise.
+ */ +static inline bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, + void **entry) +{ + *entry = NULL; + + if (unlikely(mas_is_none(mas))) { + if (mas->index <= min) + goto none; + + mas->last = mas->index; + mas->node = MAS_START; + } + if (unlikely(mas_is_paused(mas))) { - if (unlikely(mas->last == ULONG_MAX)) { + if (unlikely(mas->index <= min)) { mas->node = MAS_NONE; - return NULL; + return true; } mas->node = MAS_START; - mas->index = ++mas->last; + mas->last = --mas->index; } - if (unlikely(mas_is_none(mas))) - mas->node = MAS_START; - if (unlikely(mas_is_start(mas))) { /* First run or continue */ - void *entry; + if (mas->index < min) + return true; - if (mas->index > max) - return NULL; + *entry = mas_walk(mas); + if (*entry) + return true; + } - entry = mas_walk(mas); - if (entry) - return entry; + if (unlikely(!mas_searchable(mas))) { + if (mas_is_ptr(mas)) + goto none; + + if (mas_is_none(mas)) { + /* + * Walked to the location, and there was nothing so the + * previous location is 0. + */ + mas->last = mas->index = 0; + mas->node = MAS_ROOT; + *entry = mas_root(mas); + return true; + } } - if (unlikely(!mas_searchable(mas))) - return NULL; + if (mas->index < min) + return true; - /* Retries on dead nodes handled by mas_next_entry */ - return mas_next_entry(mas, max); + return false; + +none: + mas->node = MAS_NONE; + return true; } -EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_rev: On the first call, find the first non-null entry at or below @@ -6100,37 +6232,41 @@ EXPORT_SYMBOL_GPL(mas_find); */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { - if (unlikely(mas_is_paused(mas))) { - if (unlikely(mas->last == ULONG_MAX)) { - mas->node = MAS_NONE; - return NULL; - } - mas->node = MAS_START; - mas->last = --mas->index; - } + void *entry; - if (unlikely(mas_is_start(mas))) { - /* First run or continue */ - void *entry; + if (mas_find_rev_setup(mas, min, &entry)) + return entry; - if (mas->index < min) - return NULL; + /* Retries on dead nodes handled by mas_prev_slot */ + return mas_prev_slot(mas, min, false); - entry = mas_walk(mas); - if (entry) - return entry; - } +} +EXPORT_SYMBOL_GPL(mas_find_rev); - if (unlikely(!mas_searchable(mas))) - return NULL; +/** + * mas_find_range_rev: On the first call, find the first non-null entry at or + * below mas->index down to %min. Otherwise advance to the previous slot after + * mas->index down to %min. + * @mas: The maple state + * @min: The minimum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. 
+ */ +void *mas_find_range_rev(struct ma_state *mas, unsigned long min) +{ + void *entry; - if (mas->index < min) - return NULL; + if (mas_find_rev_setup(mas, min, &entry)) + return entry; - /* Retries on dead nodes handled by mas_prev_entry */ - return mas_prev_entry(mas, min); + /* Retries on dead nodes handled by mas_prev_slot */ + return mas_prev_slot(mas, min, true); } -EXPORT_SYMBOL_GPL(mas_find_rev); +EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire @@ -6357,7 +6493,7 @@ int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, { int ret = 0; - MA_STATE(mas, mt, min, max - size); + MA_STATE(mas, mt, min, min); if (!mt_is_alloc(mt)) return -EINVAL; @@ -6377,7 +6513,7 @@ int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, retry: mas.offset = 0; mas.index = min; - mas.last = max - size; + mas.last = max - size + 1; ret = mas_alloc(&mas, entry, size, startp); if (mas_nomem(&mas, gfp)) goto retry; @@ -6393,14 +6529,14 @@ int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, { int ret = 0; - MA_STATE(mas, mt, min, max - size); + MA_STATE(mas, mt, min, max - size + 1); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; - if (min >= max) + if (min > max) return -EINVAL; if (max < size - 1) @@ -6512,7 +6648,7 @@ retry: if (entry) goto unlock; - while (mas_searchable(&mas) && (mas.index < max)) { + while (mas_searchable(&mas) && (mas.last < max)) { entry = mas_next_entry(&mas, max); if (likely(entry && !xa_is_zero(entry))) break; @@ -6525,10 +6661,9 @@ unlock: if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE - if ((*index) && (*index) <= copy) + if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! 
%lx <= %lx\n", *index, copy); - MT_BUG_ON(mt, (*index) && ((*index) <= copy)); #endif } @@ -6674,7 +6809,7 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, max = mas->max; mas->offset = 0; while (likely(!ma_is_leaf(mt))) { - MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); + MAS_WARN_ON(mas, mte_dead_node(mas->node)); slots = ma_slots(mn, mt); entry = mas_slot(mas, slots, 0); pivots = ma_pivots(mn, mt); @@ -6685,7 +6820,7 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, mn = mas_mn(mas); mt = mte_node_type(mas->node); } - MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); + MAS_WARN_ON(mas, mte_dead_node(mas->node)); mas->max = max; slots = ma_slots(mn, mt); @@ -6735,15 +6870,12 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) mas->node = mn; mas_ascend(mas); - while (mas->node != MAS_NONE) { + do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); - } - - if (p == MAS_NONE) - return; + } while (!mas_is_none(mas)); mas->node = p; mas->max = p_max; @@ -6752,22 +6884,33 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth); + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, - unsigned int depth) + unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; - if (min == max) - pr_info("%.*s%lu: ", depth * 2, spaces, min); - else - pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); + switch(format) { + case mt_dump_hex: + if (min == max) + pr_info("%.*s%lx: ", depth * 2, spaces, min); + else + pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); + break; + default: + case mt_dump_dec: + if (min == max) + pr_info("%.*s%lu: ", depth * 2, spaces, min); + else + pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); + } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, - unsigned int depth) + unsigned int depth, enum mt_dump_format format) { - mt_dump_range(min, max, depth); + mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), @@ -6781,7 +6924,8 @@ static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, } static void mt_dump_range64(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth) + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); @@ -6789,8 +6933,16 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, int i; pr_cont(" contents: "); - for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { + switch(format) { + case mt_dump_hex: + pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + break; + default: + case mt_dump_dec: + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + } + } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; @@ -6803,24 +6955,32 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 
1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); if (last == max) break; if (last > max) { - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + switch(format) { + case mt_dump_hex: + pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); - break; + break; + default: + case mt_dump_dec: + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + node, last, max, i); + } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth) + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; bool leaf = mte_is_leaf(entry); @@ -6845,10 +7005,10 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); if (last == max) break; @@ -6862,13 +7022,14 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, } static void mt_dump_node(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth) + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; - mt_dump_range(min, max, depth); + mt_dump_range(min, max, depth, format); pr_cont("node %p depth %d type %d parent %p", node, depth, type, node ? node->parent : NULL); @@ -6879,15 +7040,15 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry, if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), - min + i, min + i, depth); + min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: - mt_dump_range64(mt, entry, min, max, depth); + mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: - mt_dump_arange64(mt, entry, min, max, depth); + mt_dump_arange64(mt, entry, min, max, depth, format); break; default: @@ -6895,16 +7056,16 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry, } } -void mt_dump(const struct maple_tree *mt) +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(%p) flags %X, height %u root %p\n", mt, mt->ma_flags, mt_height(mt), entry); if (!xa_is_node(entry)) - mt_dump_entry(entry, 0, 0, 0); + mt_dump_entry(entry, 0, 0, 0, format); else if (entry) - mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); + mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); } EXPORT_SYMBOL_GPL(mt_dump); @@ -6957,7 +7118,7 @@ static void mas_validate_gaps(struct ma_state *mas) mas_mn(mas), i, mas_get_slot(mas, i), gap, p_end, p_start); - mt_dump(mas->tree); + mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, gap != p_end - p_start + 1); @@ -6988,27 +7149,29 @@ counted: p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); - if (ma_gaps(p_mn, mas_parent_enum(mas, mte))[p_slot] != max_gap) { + if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); - 
mt_dump(mas->tree); + mt_dump(mas->tree, mt_dump_hex); } MT_BUG_ON(mas->tree, - ma_gaps(p_mn, mas_parent_enum(mas, mte))[p_slot] != max_gap); + ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap); } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; - enum maple_type p_type = mas_parent_enum(mas, mas->node); - unsigned char p_slot = mte_parent_slot(mas->node); + enum maple_type p_type; + unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; + p_slot = mte_parent_slot(mas->node); + p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); @@ -7101,18 +7264,18 @@ static void mas_validate_limits(struct ma_state *mas) if (prev_piv > piv) { pr_err("%p[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); - MT_BUG_ON(mas->tree, piv < prev_piv); + MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); - MT_BUG_ON(mas->tree, piv < mas->min); + MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); - MT_BUG_ON(mas->tree, piv > mas->max); + MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) @@ -7135,7 +7298,7 @@ static void mas_validate_limits(struct ma_state *mas) pr_err("%p[%u] should not have piv %lu\n", mas_mn(mas), i, piv); - MT_BUG_ON(mas->tree, i < mt_pivots[type] - 1); + MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } @@ -7194,16 +7357,15 @@ void mt_validate(struct maple_tree *mt) mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node)); while (!mas_is_none(&mas)) { - MT_BUG_ON(mas.tree, mte_dead_node(mas.node)); + MAS_WARN_ON(&mas, mte_dead_node(mas.node)); if (!mte_is_root(mas.node)) { end = mas_data_end(&mas); - if ((end < mt_min_slot_count(mas.node)) && - (mas.max != ULONG_MAX)) { + if (MAS_WARN_ON(&mas, + (end < mt_min_slot_count(mas.node)) && + (mas.max != ULONG_MAX))) { pr_err("Invalid size %u of %p\n", end, - mas_mn(&mas)); - MT_BUG_ON(mas.tree, 1); + mas_mn(&mas)); } - } mas_validate_parent_slot(&mas); mas_validate_child_slot(&mas); @@ -7219,4 +7381,34 @@ done: } EXPORT_SYMBOL_GPL(mt_validate); +void mas_dump(const struct ma_state *mas) +{ + pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); + if (mas_is_none(mas)) + pr_err("(MAS_NONE) "); + else if (mas_is_ptr(mas)) + pr_err("(MAS_ROOT) "); + else if (mas_is_start(mas)) + pr_err("(MAS_START) "); + else if (mas_is_paused(mas)) + pr_err("(MAS_PAUSED) "); + + pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last); + pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", + mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); + if (mas->index > mas->last) + pr_err("Check index & last\n"); +} +EXPORT_SYMBOL_GPL(mas_dump); + +void mas_wr_dump(const struct ma_wr_state *wr_mas) +{ + pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", + wr_mas->node, wr_mas->r_min, wr_mas->r_max); + pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", + wr_mas->type, wr_mas->offset_end, wr_mas->node_end, + wr_mas->end_piv); +} +EXPORT_SYMBOL_GPL(mas_wr_dump); + #endif /* CONFIG_DEBUG_MAPLE_TREE */ diff --git a/lib/show_mem.c b/lib/show_mem.c deleted file mode 100644 index 1485c87be935..000000000000 --- a/lib/show_mem.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generic show_mem() implementation - * - * Copyright (C) 2008 
Johannes Weiner <hannes@saeurebad.de> - */ - -#include <linux/mm.h> -#include <linux/cma.h> - -void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) -{ - unsigned long total = 0, reserved = 0, highmem = 0; - struct zone *zone; - - printk("Mem-Info:\n"); - __show_free_areas(filter, nodemask, max_zone_idx); - - for_each_populated_zone(zone) { - - total += zone->present_pages; - reserved += zone->present_pages - zone_managed_pages(zone); - - if (is_highmem(zone)) - highmem += zone->present_pages; - } - - printk("%lu pages RAM\n", total); - printk("%lu pages HighMem/MovableOnly\n", highmem); - printk("%lu pages reserved\n", reserved); -#ifdef CONFIG_CMA - printk("%lu pages cma reserved\n", totalcma_pages); -#endif -#ifdef CONFIG_MEMORY_FAILURE - printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); -#endif -} diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 2f5aa851834e..7053221ce1d8 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -29,6 +29,7 @@ #include <linux/types.h> #include <linux/memblock.h> #include <linux/kasan-enabled.h> +#include <linux/seq_file.h> #define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) @@ -60,6 +61,7 @@ struct stack_record { u32 hash; /* Hash in the hash table */ u32 size; /* Number of stored frames */ union handle_parts handle; + refcount_t count; /* Number of the same repeated stacks */ unsigned long entries[]; /* Variable-sized array of frames */ }; @@ -305,6 +307,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; + refcount_set(&stack->count, 1); memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); pool_offset += required_size; /* @@ -457,8 +460,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_depot_save); -unsigned int stack_depot_fetch(depot_stack_handle_t handle, - unsigned long **entries) +static struct stack_record *stack_depot_getstack(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; /* @@ -470,6 +472,100 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; + if (!handle) + return NULL; + + if (parts.pool_index > pool_index_cached) { + WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", + parts.pool_index, pool_index_cached, handle); + return NULL; + } + pool = stack_pools[parts.pool_index]; + if (!pool) + return NULL; + stack = pool + offset; + return stack; +} + +#ifdef CONFIG_PAGE_OWNER + +extern unsigned long page_owner_stack_threshold; + +void *stack_start(struct seq_file *m, loff_t *ppos) +{ + unsigned long *table = m->private; + struct stack_record **stacks, *stack; + + /* First time */ + if (*ppos == 0) + *table = 0; + + if (*ppos == -1UL) + return NULL; + + stacks = &stack_table[*table]; + stack = (struct stack_record *)stacks; + + return stack; +} + +void *stack_next(struct seq_file *m, void *v, loff_t *ppos) +{ + unsigned long *table = m->private; + unsigned long nr_table = *table; + struct stack_record *next = NULL, *stack = v, **stacks; + unsigned long stack_table_entries = stack_hash_mask + 1; + + if (!stack) { +new_table: + /* New table */ + nr_table++; + if (nr_table >= stack_table_entries) + goto out; + stacks = &stack_table[nr_table]; + stack = (struct stack_record *)stacks; + next = stack; + } else { + next = stack->next; + } + + if (!next) + goto 
new_table; + +out: + *table = nr_table; + *ppos = (nr_table >= stack_table_entries) ? -1UL : *ppos + 1; + return next; +} + +int stack_print(struct seq_file *m, void *v) +{ + char *buf; + int ret = 0; + struct stack_record *stack = v; + + if (!stack->size || stack->size < 0 || + stack->size > PAGE_SIZE || stack->handle.valid != 1 || + refcount_read(&stack->count) < page_owner_stack_threshold) + return 0; + + buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + ret += stack_trace_snprint(buf, PAGE_SIZE, stack->entries, stack->size, 0); + scnprintf(buf + ret, PAGE_SIZE - ret, "stack count: %d\n\n", + refcount_read(&stack->count)); + seq_printf(m, buf); + seq_puts(m, "\n\n"); + kfree(buf); + + return 0; +} +#endif + +unsigned int stack_depot_fetch(depot_stack_handle_t handle, + unsigned long **entries) +{ + struct stack_record *stack; + *entries = NULL; /* * Let KMSAN know *entries is initialized. This shall prevent false @@ -480,21 +576,33 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.pool_index > pool_index_cached) { - WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pool_index_cached, handle); - return 0; - } - pool = stack_pools[parts.pool_index]; - if (!pool) + stack = stack_depot_getstack(handle); + if (!stack) return 0; - stack = pool + offset; *entries = stack->entries; return stack->size; } EXPORT_SYMBOL_GPL(stack_depot_fetch); +void stack_depot_inc_count(depot_stack_handle_t handle) +{ + struct stack_record *stack = NULL; + + stack = stack_depot_getstack(handle); + if (stack) + refcount_inc(&stack->count); +} + +void stack_depot_dec_count(depot_stack_handle_t handle) +{ + struct stack_record *stack = NULL; + + stack = stack_depot_getstack(handle); + if (stack) + refcount_dec(&stack->count); +} + void stack_depot_print(depot_stack_handle_t stack) { unsigned long *entries; diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index f1db333270e9..9939be34e516 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -11,12 +11,33 @@ #include <linux/module.h> #define MTREE_ALLOC_MAX 0x2000000000000Ul -#ifndef CONFIG_DEBUG_MAPLE_TREE -#define CONFIG_DEBUG_MAPLE_TREE -#endif #define CONFIG_MAPLE_SEARCH #define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31) +#ifndef CONFIG_DEBUG_MAPLE_TREE +#define mt_dump(mt, fmt) do {} while (0) +#define mt_validate(mt) do {} while (0) +#define mt_cache_shrink() do {} while (0) +#define mas_dump(mas) do {} while (0) +#define mas_wr_dump(mas) do {} while (0) +atomic_t maple_tree_tests_run; +atomic_t maple_tree_tests_passed; +#undef MT_BUG_ON + +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) +#endif + /* #define BENCH_SLOT_STORE */ /* #define BENCH_NODE_STORE */ /* #define BENCH_AWALK */ @@ -30,54 +51,54 @@ #else #define cond_resched() do {} while (0) #endif -static -int mtree_insert_index(struct maple_tree *mt, unsigned long index, gfp_t gfp) +static int __init mtree_insert_index(struct maple_tree *mt, + unsigned long index, gfp_t gfp) { return mtree_insert(mt, index, xa_mk_value(index & LONG_MAX), gfp); } -static void mtree_erase_index(struct maple_tree *mt, unsigned long index) +static void __init mtree_erase_index(struct maple_tree *mt, unsigned long index) { MT_BUG_ON(mt, 
mtree_erase(mt, index) != xa_mk_value(index & LONG_MAX)); MT_BUG_ON(mt, mtree_load(mt, index) != NULL); } -static int mtree_test_insert(struct maple_tree *mt, unsigned long index, +static int __init mtree_test_insert(struct maple_tree *mt, unsigned long index, void *ptr) { return mtree_insert(mt, index, ptr, GFP_KERNEL); } -static int mtree_test_store_range(struct maple_tree *mt, unsigned long start, - unsigned long end, void *ptr) +static int __init mtree_test_store_range(struct maple_tree *mt, + unsigned long start, unsigned long end, void *ptr) { return mtree_store_range(mt, start, end, ptr, GFP_KERNEL); } -static int mtree_test_store(struct maple_tree *mt, unsigned long start, +static int __init mtree_test_store(struct maple_tree *mt, unsigned long start, void *ptr) { return mtree_test_store_range(mt, start, start, ptr); } -static int mtree_test_insert_range(struct maple_tree *mt, unsigned long start, - unsigned long end, void *ptr) +static int __init mtree_test_insert_range(struct maple_tree *mt, + unsigned long start, unsigned long end, void *ptr) { return mtree_insert_range(mt, start, end, ptr, GFP_KERNEL); } -static void *mtree_test_load(struct maple_tree *mt, unsigned long index) +static void __init *mtree_test_load(struct maple_tree *mt, unsigned long index) { return mtree_load(mt, index); } -static void *mtree_test_erase(struct maple_tree *mt, unsigned long index) +static void __init *mtree_test_erase(struct maple_tree *mt, unsigned long index) { return mtree_erase(mt, index); } #if defined(CONFIG_64BIT) -static noinline void check_mtree_alloc_range(struct maple_tree *mt, +static noinline void __init check_mtree_alloc_range(struct maple_tree *mt, unsigned long start, unsigned long end, unsigned long size, unsigned long expected, int eret, void *ptr) { @@ -94,7 +115,7 @@ static noinline void check_mtree_alloc_range(struct maple_tree *mt, MT_BUG_ON(mt, result != expected); } -static noinline void check_mtree_alloc_rrange(struct maple_tree *mt, +static noinline void __init check_mtree_alloc_rrange(struct maple_tree *mt, unsigned long start, unsigned long end, unsigned long size, unsigned long expected, int eret, void *ptr) { @@ -102,7 +123,7 @@ static noinline void check_mtree_alloc_rrange(struct maple_tree *mt, unsigned long result = expected + 1; int ret; - ret = mtree_alloc_rrange(mt, &result, ptr, size, start, end - 1, + ret = mtree_alloc_rrange(mt, &result, ptr, size, start, end, GFP_KERNEL); MT_BUG_ON(mt, ret != eret); if (ret) @@ -112,8 +133,8 @@ static noinline void check_mtree_alloc_rrange(struct maple_tree *mt, } #endif -static noinline void check_load(struct maple_tree *mt, unsigned long index, - void *ptr) +static noinline void __init check_load(struct maple_tree *mt, + unsigned long index, void *ptr) { void *ret = mtree_test_load(mt, index); @@ -122,7 +143,7 @@ static noinline void check_load(struct maple_tree *mt, unsigned long index, MT_BUG_ON(mt, ret != ptr); } -static noinline void check_store_range(struct maple_tree *mt, +static noinline void __init check_store_range(struct maple_tree *mt, unsigned long start, unsigned long end, void *ptr, int expected) { int ret = -EINVAL; @@ -138,7 +159,7 @@ static noinline void check_store_range(struct maple_tree *mt, check_load(mt, i, ptr); } -static noinline void check_insert_range(struct maple_tree *mt, +static noinline void __init check_insert_range(struct maple_tree *mt, unsigned long start, unsigned long end, void *ptr, int expected) { int ret = -EINVAL; @@ -154,8 +175,8 @@ static noinline void check_insert_range(struct 
maple_tree *mt, check_load(mt, i, ptr); } -static noinline void check_insert(struct maple_tree *mt, unsigned long index, - void *ptr) +static noinline void __init check_insert(struct maple_tree *mt, + unsigned long index, void *ptr) { int ret = -EINVAL; @@ -163,7 +184,7 @@ static noinline void check_insert(struct maple_tree *mt, unsigned long index, MT_BUG_ON(mt, ret != 0); } -static noinline void check_dup_insert(struct maple_tree *mt, +static noinline void __init check_dup_insert(struct maple_tree *mt, unsigned long index, void *ptr) { int ret = -EINVAL; @@ -173,13 +194,13 @@ static noinline void check_dup_insert(struct maple_tree *mt, } -static noinline -void check_index_load(struct maple_tree *mt, unsigned long index) +static noinline void __init check_index_load(struct maple_tree *mt, + unsigned long index) { return check_load(mt, index, xa_mk_value(index & LONG_MAX)); } -static inline int not_empty(struct maple_node *node) +static inline __init int not_empty(struct maple_node *node) { int i; @@ -194,8 +215,8 @@ static inline int not_empty(struct maple_node *node) } -static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max, - bool verbose) +static noinline void __init check_rev_seq(struct maple_tree *mt, + unsigned long max, bool verbose) { unsigned long i = max, j; @@ -219,7 +240,7 @@ static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max, #ifndef __KERNEL__ if (verbose) { rcu_barrier(); - mt_dump(mt); + mt_dump(mt, mt_dump_dec); pr_info(" %s test of 0-%lu %luK in %d active (%d total)\n", __func__, max, mt_get_alloc_size()/1024, mt_nr_allocated(), mt_nr_tallocated()); @@ -227,7 +248,7 @@ static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max, #endif } -static noinline void check_seq(struct maple_tree *mt, unsigned long max, +static noinline void __init check_seq(struct maple_tree *mt, unsigned long max, bool verbose) { unsigned long i, j; @@ -248,7 +269,7 @@ static noinline void check_seq(struct maple_tree *mt, unsigned long max, #ifndef __KERNEL__ if (verbose) { rcu_barrier(); - mt_dump(mt); + mt_dump(mt, mt_dump_dec); pr_info(" seq test of 0-%lu %luK in %d active (%d total)\n", max, mt_get_alloc_size()/1024, mt_nr_allocated(), mt_nr_tallocated()); @@ -256,7 +277,7 @@ static noinline void check_seq(struct maple_tree *mt, unsigned long max, #endif } -static noinline void check_lb_not_empty(struct maple_tree *mt) +static noinline void __init check_lb_not_empty(struct maple_tree *mt) { unsigned long i, j; unsigned long huge = 4000UL * 1000 * 1000; @@ -275,13 +296,13 @@ static noinline void check_lb_not_empty(struct maple_tree *mt) mtree_destroy(mt); } -static noinline void check_lower_bound_split(struct maple_tree *mt) +static noinline void __init check_lower_bound_split(struct maple_tree *mt) { MT_BUG_ON(mt, !mtree_empty(mt)); check_lb_not_empty(mt); } -static noinline void check_upper_bound_split(struct maple_tree *mt) +static noinline void __init check_upper_bound_split(struct maple_tree *mt) { unsigned long i, j; unsigned long huge; @@ -306,7 +327,7 @@ static noinline void check_upper_bound_split(struct maple_tree *mt) mtree_destroy(mt); } -static noinline void check_mid_split(struct maple_tree *mt) +static noinline void __init check_mid_split(struct maple_tree *mt) { unsigned long huge = 8000UL * 1000 * 1000; @@ -315,7 +336,7 @@ static noinline void check_mid_split(struct maple_tree *mt) check_lb_not_empty(mt); } -static noinline void check_rev_find(struct maple_tree *mt) +static noinline void __init 
check_rev_find(struct maple_tree *mt) { int i, nr_entries = 200; void *val; @@ -354,7 +375,7 @@ static noinline void check_rev_find(struct maple_tree *mt) rcu_read_unlock(); } -static noinline void check_find(struct maple_tree *mt) +static noinline void __init check_find(struct maple_tree *mt) { unsigned long val = 0; unsigned long count; @@ -571,7 +592,7 @@ static noinline void check_find(struct maple_tree *mt) mtree_destroy(mt); } -static noinline void check_find_2(struct maple_tree *mt) +static noinline void __init check_find_2(struct maple_tree *mt) { unsigned long i, j; void *entry; @@ -616,7 +637,7 @@ static noinline void check_find_2(struct maple_tree *mt) #if defined(CONFIG_64BIT) -static noinline void check_alloc_rev_range(struct maple_tree *mt) +static noinline void __init check_alloc_rev_range(struct maple_tree *mt) { /* * Generated by: @@ -624,7 +645,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) * awk -F "-" '{printf "0x%s, 0x%s, ", $1, $2}' */ - unsigned long range[] = { + static const unsigned long range[] = { /* Inclusive , Exclusive. */ 0x565234af2000, 0x565234af4000, 0x565234af4000, 0x565234af9000, @@ -652,7 +673,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) 0x7fff58791000, 0x7fff58793000, }; - unsigned long holes[] = { + static const unsigned long holes[] = { /* * Note: start of hole is INCLUSIVE * end of hole is EXCLUSIVE @@ -672,7 +693,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) * 4. number that should be returned. * 5. return value */ - unsigned long req_range[] = { + static const unsigned long req_range[] = { 0x565234af9000, /* Min */ 0x7fff58791000, /* Max */ 0x1000, /* Size */ @@ -680,7 +701,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) 0, /* Return value success. */ 0x0, /* Min */ - 0x565234AF1 << 12, /* Max */ + 0x565234AF0 << 12, /* Max */ 0x3000, /* Size */ 0x565234AEE << 12, /* max - 3. */ 0, /* Return value success. */ @@ -692,14 +713,14 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) 0, /* Return value success. */ 0x0, /* Min */ - 0x7F36D510A << 12, /* Max */ + 0x7F36D5109 << 12, /* Max */ 0x4000, /* Size */ 0x7F36D5106 << 12, /* First rev hole of size 0x4000 */ 0, /* Return value success. */ /* Ascend test. */ 0x0, - 34148798629 << 12, + 34148798628 << 12, 19 << 12, 34148797418 << 12, 0x0, @@ -711,6 +732,12 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) 0x0, -EBUSY, + /* Single space test. */ + 34148798725 << 12, + 34148798725 << 12, + 1 << 12, + 34148798725 << 12, + 0, }; int i, range_count = ARRAY_SIZE(range); @@ -759,9 +786,9 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) mas_unlock(&mas); for (i = 0; i < req_range_count; i += 5) { #if DEBUG_REV_RANGE - pr_debug("\tReverse request between %lu-%lu size %lu, should get %lu\n", - req_range[i] >> 12, - (req_range[i + 1] >> 12) - 1, + pr_debug("\tReverse request %d between %lu-%lu size %lu, should get %lu\n", + i, req_range[i] >> 12, + (req_range[i + 1] >> 12), req_range[i+2] >> 12, req_range[i+3] >> 12); #endif @@ -777,13 +804,14 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt) mt_set_non_kernel(1); mtree_erase(mt, 34148798727); /* create a deleted range. 
*/ + mtree_erase(mt, 34148798725); check_mtree_alloc_rrange(mt, 0, 34359052173, 210253414, 34148798725, 0, mt); mtree_destroy(mt); } -static noinline void check_alloc_range(struct maple_tree *mt) +static noinline void __init check_alloc_range(struct maple_tree *mt) { /* * Generated by: @@ -791,7 +819,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) * awk -F "-" '{printf "0x%s, 0x%s, ", $1, $2}' */ - unsigned long range[] = { + static const unsigned long range[] = { /* Inclusive , Exclusive. */ 0x565234af2000, 0x565234af4000, 0x565234af4000, 0x565234af9000, @@ -818,7 +846,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) 0x7fff5878e000, 0x7fff58791000, 0x7fff58791000, 0x7fff58793000, }; - unsigned long holes[] = { + static const unsigned long holes[] = { /* Start of hole, end of hole, size of hole (+1) */ 0x565234afb000, 0x565234afc000, 0x1000, 0x565234afe000, 0x565235def000, 0x12F1000, @@ -833,7 +861,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) * 4. number that should be returned. * 5. return value */ - unsigned long req_range[] = { + static const unsigned long req_range[] = { 0x565234af9000, /* Min */ 0x7fff58791000, /* Max */ 0x1000, /* Size */ @@ -880,6 +908,13 @@ static noinline void check_alloc_range(struct maple_tree *mt) 4503599618982063UL << 12, /* Size */ 34359052178 << 12, /* Expected location */ -EBUSY, /* Return failure. */ + + /* Test a single entry */ + 34148798648 << 12, /* Min */ + 34148798648 << 12, /* Max */ + 4096, /* Size of 1 */ + 34148798648 << 12, /* Location is the same as min/max */ + 0, /* Success */ }; int i, range_count = ARRAY_SIZE(range); int req_range_count = ARRAY_SIZE(req_range); @@ -893,7 +928,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) #if DEBUG_ALLOC_RANGE pr_debug("\tInsert %lu-%lu\n", range[i] >> 12, (range[i + 1] >> 12) - 1); - mt_dump(mt); + mt_dump(mt, mt_dump_hex); #endif check_insert_range(mt, range[i] >> 12, (range[i + 1] >> 12) - 1, xa_mk_value(range[i] >> 12), 0); @@ -934,7 +969,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) xa_mk_value(req_range[i] >> 12)); /* pointer */ mt_validate(mt); #if DEBUG_ALLOC_RANGE - mt_dump(mt); + mt_dump(mt, mt_dump_hex); #endif } @@ -942,10 +977,10 @@ static noinline void check_alloc_range(struct maple_tree *mt) } #endif -static noinline void check_ranges(struct maple_tree *mt) +static noinline void __init check_ranges(struct maple_tree *mt) { int i, val, val2; - unsigned long r[] = { + static const unsigned long r[] = { 10, 15, 20, 25, 17, 22, /* Overlaps previous range. 
*/ @@ -1210,7 +1245,7 @@ static noinline void check_ranges(struct maple_tree *mt) MT_BUG_ON(mt, mt_height(mt) != 4); } -static noinline void check_next_entry(struct maple_tree *mt) +static noinline void __init check_next_entry(struct maple_tree *mt) { void *entry = NULL; unsigned long limit = 30, i = 0; @@ -1234,7 +1269,7 @@ static noinline void check_next_entry(struct maple_tree *mt) mtree_destroy(mt); } -static noinline void check_prev_entry(struct maple_tree *mt) +static noinline void __init check_prev_entry(struct maple_tree *mt) { unsigned long index = 16; void *value; @@ -1278,7 +1313,7 @@ static noinline void check_prev_entry(struct maple_tree *mt) mas_unlock(&mas); } -static noinline void check_root_expand(struct maple_tree *mt) +static noinline void __init check_root_expand(struct maple_tree *mt) { MA_STATE(mas, mt, 0, 0); void *ptr; @@ -1287,6 +1322,7 @@ static noinline void check_root_expand(struct maple_tree *mt) mas_lock(&mas); mas_set(&mas, 3); ptr = mas_walk(&mas); + MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, ptr != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != ULONG_MAX); @@ -1356,7 +1392,7 @@ static noinline void check_root_expand(struct maple_tree *mt) mas_store_gfp(&mas, ptr, GFP_KERNEL); ptr = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, ptr != NULL); - MT_BUG_ON(mt, (mas.index != 1) && (mas.last != ULONG_MAX)); + MT_BUG_ON(mt, (mas.index != ULONG_MAX) && (mas.last != ULONG_MAX)); mas_set(&mas, 1); ptr = mas_prev(&mas, 0); @@ -1367,13 +1403,13 @@ static noinline void check_root_expand(struct maple_tree *mt) mas_unlock(&mas); } -static noinline void check_gap_combining(struct maple_tree *mt) +static noinline void __init check_gap_combining(struct maple_tree *mt) { struct maple_enode *mn1, *mn2; void *entry; unsigned long singletons = 100; - unsigned long *seq100; - unsigned long seq100_64[] = { + static const unsigned long *seq100; + static const unsigned long seq100_64[] = { /* 0-5 */ 74, 75, 76, 50, 100, 2, @@ -1387,7 +1423,7 @@ static noinline void check_gap_combining(struct maple_tree *mt) 76, 2, 79, 85, 4, }; - unsigned long seq100_32[] = { + static const unsigned long seq100_32[] = { /* 0-5 */ 61, 62, 63, 50, 100, 2, @@ -1401,11 +1437,11 @@ static noinline void check_gap_combining(struct maple_tree *mt) 76, 2, 79, 85, 4, }; - unsigned long seq2000[] = { + static const unsigned long seq2000[] = { 1152, 1151, 1100, 1200, 2, }; - unsigned long seq400[] = { + static const unsigned long seq400[] = { 286, 318, 256, 260, 266, 270, 275, 280, 290, 398, 286, 310, @@ -1564,7 +1600,7 @@ static noinline void check_gap_combining(struct maple_tree *mt) mt_set_non_kernel(0); mtree_destroy(mt); } -static noinline void check_node_overwrite(struct maple_tree *mt) +static noinline void __init check_node_overwrite(struct maple_tree *mt) { int i, max = 4000; @@ -1572,12 +1608,12 @@ static noinline void check_node_overwrite(struct maple_tree *mt) mtree_test_store_range(mt, i*100, i*100 + 50, xa_mk_value(i*100)); mtree_test_store_range(mt, 319951, 367950, NULL); - /*mt_dump(mt); */ + /*mt_dump(mt, mt_dump_dec); */ mt_validate(mt); } #if defined(BENCH_SLOT_STORE) -static noinline void bench_slot_store(struct maple_tree *mt) +static noinline void __init bench_slot_store(struct maple_tree *mt) { int i, brk = 105, max = 1040, brk_start = 100, count = 20000000; @@ -1593,7 +1629,7 @@ static noinline void bench_slot_store(struct maple_tree *mt) #endif #if defined(BENCH_NODE_STORE) -static noinline void bench_node_store(struct maple_tree *mt) +static noinline void __init 
bench_node_store(struct maple_tree *mt) { int i, overwrite = 76, max = 240, count = 20000000; @@ -1612,7 +1648,7 @@ static noinline void bench_node_store(struct maple_tree *mt) #endif #if defined(BENCH_AWALK) -static noinline void bench_awalk(struct maple_tree *mt) +static noinline void __init bench_awalk(struct maple_tree *mt) { int i, max = 2500, count = 50000000; MA_STATE(mas, mt, 1470, 1470); @@ -1629,7 +1665,7 @@ static noinline void bench_awalk(struct maple_tree *mt) } #endif #if defined(BENCH_WALK) -static noinline void bench_walk(struct maple_tree *mt) +static noinline void __init bench_walk(struct maple_tree *mt) { int i, max = 2500, count = 550000000; MA_STATE(mas, mt, 1470, 1470); @@ -1646,7 +1682,7 @@ static noinline void bench_walk(struct maple_tree *mt) #endif #if defined(BENCH_MT_FOR_EACH) -static noinline void bench_mt_for_each(struct maple_tree *mt) +static noinline void __init bench_mt_for_each(struct maple_tree *mt) { int i, count = 1000000; unsigned long max = 2500, index = 0; @@ -1670,7 +1706,7 @@ static noinline void bench_mt_for_each(struct maple_tree *mt) #endif /* check_forking - simulate the kernel forking sequence with the tree. */ -static noinline void check_forking(struct maple_tree *mt) +static noinline void __init check_forking(struct maple_tree *mt) { struct maple_tree newmt; @@ -1709,7 +1745,7 @@ static noinline void check_forking(struct maple_tree *mt) mtree_destroy(&newmt); } -static noinline void check_iteration(struct maple_tree *mt) +static noinline void __init check_iteration(struct maple_tree *mt) { int i, nr_entries = 125; void *val; @@ -1765,7 +1801,6 @@ static noinline void check_iteration(struct maple_tree *mt) mas.index = 760; mas.last = 765; mas_store(&mas, val); - mas_next(&mas, ULONG_MAX); } i++; } @@ -1777,7 +1812,7 @@ static noinline void check_iteration(struct maple_tree *mt) mt_set_non_kernel(0); } -static noinline void check_mas_store_gfp(struct maple_tree *mt) +static noinline void __init check_mas_store_gfp(struct maple_tree *mt) { struct maple_tree newmt; @@ -1810,7 +1845,7 @@ static noinline void check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(struct maple_tree *mt) { struct maple_tree newmt; @@ -1852,15 +1887,17 @@ static noinline void bench_forking(struct maple_tree *mt) } #endif -static noinline void next_prev_test(struct maple_tree *mt) +static noinline void __init next_prev_test(struct maple_tree *mt) { int i, nr_entries; void *val; MA_STATE(mas, mt, 0, 0); struct maple_enode *mn; - unsigned long *level2; - unsigned long level2_64[] = {707, 1000, 710, 715, 720, 725}; - unsigned long level2_32[] = {1747, 2000, 1750, 1755, 1760, 1765}; + static const unsigned long *level2; + static const unsigned long level2_64[] = { 707, 1000, 710, 715, 720, + 725}; + static const unsigned long level2_32[] = { 1747, 2000, 1750, 1755, + 1760, 1765}; if (MAPLE_32BIT) { nr_entries = 500; @@ -1974,7 +2011,7 @@ static noinline void next_prev_test(struct maple_tree *mt) val = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, val != NULL); - MT_BUG_ON(mt, mas.index != ULONG_MAX); + MT_BUG_ON(mt, mas.index != 0x7d6); MT_BUG_ON(mt, mas.last != ULONG_MAX); val = mas_prev(&mas, 0); @@ -1998,7 +2035,8 @@ static noinline void next_prev_test(struct maple_tree *mt) val = mas_prev(&mas, 0); MT_BUG_ON(mt, val != NULL); MT_BUG_ON(mt, mas.index != 0); - MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.last != 5); + MT_BUG_ON(mt, mas.node != MAS_NONE); 
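The updated expectations in next_prev_test() suggest that an iterator which runs past the last entry now reports the trailing empty range instead of clamping index to the limit. A hypothetical illustration of what a caller would observe, with a made-up single-entry tree (the exact values follow the 0x7d6 assertion above, not this sketch):

	/* Assume my_tree holds one entry spanning [10, 20]. */
	MA_STATE(mas, &my_tree, 0, 0);
	void *entry;

	rcu_read_lock();
	entry = mas_find(&mas, ULONG_MAX);   /* the [10, 20] entry            */
	entry = mas_next(&mas, ULONG_MAX);   /* NULL: walked off the end      */
	/* mas.index would now be 21 and mas.last ULONG_MAX: the trailing gap. */
	rcu_read_unlock();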
mas.index = 0; mas.last = 5; @@ -2010,7 +2048,7 @@ static noinline void next_prev_test(struct maple_tree *mt) val = mas_prev(&mas, 0); MT_BUG_ON(mt, val != NULL); MT_BUG_ON(mt, mas.index != 0); - MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.last != 9); mas_unlock(&mas); mtree_destroy(mt); @@ -2028,7 +2066,7 @@ static noinline void next_prev_test(struct maple_tree *mt) /* Test spanning writes that require balancing right sibling or right cousin */ -static noinline void check_spanning_relatives(struct maple_tree *mt) +static noinline void __init check_spanning_relatives(struct maple_tree *mt) { unsigned long i, nr_entries = 1000; @@ -2041,7 +2079,7 @@ static noinline void check_spanning_relatives(struct maple_tree *mt) mtree_store_range(mt, 9365, 9955, NULL, GFP_KERNEL); } -static noinline void check_fuzzer(struct maple_tree *mt) +static noinline void __init check_fuzzer(struct maple_tree *mt) { /* * 1. Causes a spanning rebalance of a single root node. @@ -2438,7 +2476,7 @@ static noinline void check_fuzzer(struct maple_tree *mt) } /* duplicate the tree with a specific gap */ -static noinline void check_dup_gaps(struct maple_tree *mt, +static noinline void __init check_dup_gaps(struct maple_tree *mt, unsigned long nr_entries, bool zero_start, unsigned long gap) { @@ -2478,7 +2516,7 @@ static noinline void check_dup_gaps(struct maple_tree *mt, } /* Duplicate many sizes of trees. Mainly to test expected entry values */ -static noinline void check_dup(struct maple_tree *mt) +static noinline void __init check_dup(struct maple_tree *mt) { int i; int big_start = 100010; @@ -2566,7 +2604,7 @@ static noinline void check_dup(struct maple_tree *mt) } } -static noinline void check_bnode_min_spanning(struct maple_tree *mt) +static noinline void __init check_bnode_min_spanning(struct maple_tree *mt) { int i = 50; MA_STATE(mas, mt, 0, 0); @@ -2585,7 +2623,7 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt) mt_set_non_kernel(0); } -static noinline void check_empty_area_window(struct maple_tree *mt) +static noinline void __init check_empty_area_window(struct maple_tree *mt) { unsigned long i, nr_entries = 20; MA_STATE(mas, mt, 0, 0); @@ -2660,7 +2698,7 @@ static noinline void check_empty_area_window(struct maple_tree *mt) MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY); mas_reset(&mas); - MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EINVAL); mas_reset(&mas); mas_empty_area(&mas, 100, 165, 3); @@ -2670,7 +2708,7 @@ static noinline void check_empty_area_window(struct maple_tree *mt) rcu_read_unlock(); } -static noinline void check_empty_area_fill(struct maple_tree *mt) +static noinline void __init check_empty_area_fill(struct maple_tree *mt) { const unsigned long max = 0x25D78000; unsigned long size; @@ -2713,12 +2751,635 @@ static noinline void check_empty_area_fill(struct maple_tree *mt) mt_set_non_kernel(0); } +/* + * Check MAS_START, MAS_PAUSE, active (implied), and MAS_NONE transitions. + * + * The table below shows the single entry tree (0-0 pointer) and normal tree + * with nodes. 
+ * + * Function ENTRY Start Result index & last + * ┬ ┬ ┬ ┬ ┬ + * │ │ │ │ └─ the final range + * │ │ │ └─ The node value after execution + * │ │ └─ The node value before execution + * │ └─ If the entry exists or does not exists (DNE) + * └─ The function name + * + * Function ENTRY Start Result index & last + * mas_next() + * - after last + * Single entry tree at 0-0 + * ------------------------ + * DNE MAS_START MAS_NONE 1 - oo + * DNE MAS_PAUSE MAS_NONE 1 - oo + * DNE MAS_ROOT MAS_NONE 1 - oo + * when index = 0 + * DNE MAS_NONE MAS_ROOT 0 + * when index > 0 + * DNE MAS_NONE MAS_NONE 1 - oo + * + * Normal tree + * ----------- + * exists MAS_START active range + * DNE MAS_START active set to last range + * exists MAS_PAUSE active range + * DNE MAS_PAUSE active set to last range + * exists MAS_NONE active range + * exists active active range + * DNE active active set to last range + * + * Function ENTRY Start Result index & last + * mas_prev() + * - before index + * Single entry tree at 0-0 + * ------------------------ + * if index > 0 + * exists MAS_START MAS_ROOT 0 + * exists MAS_PAUSE MAS_ROOT 0 + * exists MAS_NONE MAS_ROOT 0 + * + * if index == 0 + * DNE MAS_START MAS_NONE 0 + * DNE MAS_PAUSE MAS_NONE 0 + * DNE MAS_NONE MAS_NONE 0 + * DNE MAS_ROOT MAS_NONE 0 + * + * Normal tree + * ----------- + * exists MAS_START active range + * DNE MAS_START active set to min + * exists MAS_PAUSE active range + * DNE MAS_PAUSE active set to min + * exists MAS_NONE active range + * DNE MAS_NONE MAS_NONE set to min + * any MAS_ROOT MAS_NONE 0 + * exists active active range + * DNE active active last range + * + * Function ENTRY Start Result index & last + * mas_find() + * - at index or next + * Single entry tree at 0-0 + * ------------------------ + * if index > 0 + * DNE MAS_START MAS_NONE 0 + * DNE MAS_PAUSE MAS_NONE 0 + * DNE MAS_ROOT MAS_NONE 0 + * DNE MAS_NONE MAS_NONE 0 + * if index == 0 + * exists MAS_START MAS_ROOT 0 + * exists MAS_PAUSE MAS_ROOT 0 + * exists MAS_NONE MAS_ROOT 0 + * + * Normal tree + * ----------- + * exists MAS_START active range + * DNE MAS_START active set to max + * exists MAS_PAUSE active range + * DNE MAS_PAUSE active set to max + * exists MAS_NONE active range + * exists active active range + * DNE active active last range (max < last) + * + * Function ENTRY Start Result index & last + * mas_find_rev() + * - at index or before + * Single entry tree at 0-0 + * ------------------------ + * if index > 0 + * exists MAS_START MAS_ROOT 0 + * exists MAS_PAUSE MAS_ROOT 0 + * exists MAS_NONE MAS_ROOT 0 + * if index == 0 + * DNE MAS_START MAS_NONE 0 + * DNE MAS_PAUSE MAS_NONE 0 + * DNE MAS_NONE MAS_NONE 0 + * DNE MAS_ROOT MAS_NONE 0 + * + * Normal tree + * ----------- + * exists MAS_START active range + * DNE MAS_START active set to min + * exists MAS_PAUSE active range + * DNE MAS_PAUSE active set to min + * exists MAS_NONE active range + * exists active active range + * DNE active active last range (min > index) + * + * Function ENTRY Start Result index & last + * mas_walk() + * - Look up index + * Single entry tree at 0-0 + * ------------------------ + * if index > 0 + * DNE MAS_START MAS_ROOT 1 - oo + * DNE MAS_PAUSE MAS_ROOT 1 - oo + * DNE MAS_NONE MAS_ROOT 1 - oo + * DNE MAS_ROOT MAS_ROOT 1 - oo + * if index == 0 + * exists MAS_START MAS_ROOT 0 + * exists MAS_PAUSE MAS_ROOT 0 + * exists MAS_NONE MAS_ROOT 0 + * exists MAS_ROOT MAS_ROOT 0 + * + * Normal tree + * ----------- + * exists MAS_START active range + * DNE MAS_START active range of NULL + * exists MAS_PAUSE active 
range + * DNE MAS_PAUSE active range of NULL + * exists MAS_NONE active range + * DNE MAS_NONE active range of NULL + * exists active active range + * DNE active active range of NULL + */ + +#define mas_active(x) (((x).node != MAS_ROOT) && \ + ((x).node != MAS_START) && \ + ((x).node != MAS_PAUSE) && \ + ((x).node != MAS_NONE)) +static noinline void __init check_state_handling(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + void *entry, *ptr = (void *) 0x1234500; + void *ptr2 = &ptr; + void *ptr3 = &ptr2; + + /* Check MAS_ROOT First */ + mtree_store_range(mt, 0, 0, ptr, GFP_KERNEL); + + mas_lock(&mas); + /* prev: Start -> none */ + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* prev: Start -> root */ + mas_set(&mas, 10); + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* prev: pause -> root */ + mas_set(&mas, 10); + mas_pause(&mas); + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* next: start -> none */ + mas_set(&mas, 0); + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* next: start -> none */ + mas_set(&mas, 10); + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* find: start -> root */ + mas_set(&mas, 0); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* find: root -> none */ + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* find: none -> none */ + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* find: start -> none */ + mas_set(&mas, 10); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* find_rev: none -> root */ + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* find_rev: start -> root */ + mas_set(&mas, 0); + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* find_rev: root -> none */ + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* find_rev: none -> none */ + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* find_rev: start -> root */ + mas_set(&mas, 10); + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + 
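The MAS_PAUSE rows in the state table above correspond to the documented pattern of parking a walk so the lock can be dropped; a common sketch, with hypothetical my_tree and process_entry() names:

	void *entry;
	MA_STATE(mas, &my_tree, 0, 0);

	rcu_read_lock();
	while ((entry = mas_find(&mas, ULONG_MAX)) != NULL) {
		process_entry(entry);            /* hypothetical consumer       */
		if (need_resched()) {
			mas_pause(&mas);         /* remember position (MAS_PAUSE) */
			rcu_read_unlock();
			cond_resched();
			rcu_read_lock();         /* next mas_find() resumes after mas.last */
		}
	}
	rcu_read_unlock();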
MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* walk: start -> none */ + mas_set(&mas, 10); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* walk: pause -> none*/ + mas_set(&mas, 10); + mas_pause(&mas); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* walk: none -> none */ + mas.index = mas.last = 10; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* walk: none -> none */ + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* walk: start -> root */ + mas_set(&mas, 0); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* walk: pause -> root */ + mas_set(&mas, 0); + mas_pause(&mas); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* walk: none -> root */ + mas.node = MAS_NONE; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* walk: root -> root */ + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + /* walk: root -> none */ + mas_set(&mas, 10); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, mas.node != MAS_NONE); + + /* walk: none -> root */ + mas.index = mas.last = 0; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + MT_BUG_ON(mt, mas.node != MAS_ROOT); + + mas_unlock(&mas); + + /* Check when there is an actual node */ + mtree_store_range(mt, 0, 0, NULL, GFP_KERNEL); + mtree_store_range(mt, 0x1000, 0x1500, ptr, GFP_KERNEL); + mtree_store_range(mt, 0x2000, 0x2500, ptr2, GFP_KERNEL); + mtree_store_range(mt, 0x3000, 0x3500, ptr3, GFP_KERNEL); + + mas_lock(&mas); + + /* next: start ->active */ + mas_set(&mas, 0); + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* next: pause ->active */ + mas_set(&mas, 0); + mas_pause(&mas); + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* next: none ->active */ + mas.index = mas.last = 0; + mas.offset = 0; + mas.node = MAS_NONE; + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* next:active ->active */ + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr2); + MT_BUG_ON(mt, mas.index != 0x2000); + MT_BUG_ON(mt, mas.last != 0x2500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* next:active -> active out of range*/ + entry = mas_next(&mas, 
0x2999); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x2501); + MT_BUG_ON(mt, mas.last != 0x2fff); + MT_BUG_ON(mt, !mas_active(mas)); + + /* Continue after out of range*/ + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr3); + MT_BUG_ON(mt, mas.index != 0x3000); + MT_BUG_ON(mt, mas.last != 0x3500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* next:active -> active out of range*/ + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x3501); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, !mas_active(mas)); + + /* next: none -> active, skip value at location */ + mas_set(&mas, 0); + entry = mas_next(&mas, ULONG_MAX); + mas.node = MAS_NONE; + mas.offset = 0; + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr2); + MT_BUG_ON(mt, mas.index != 0x2000); + MT_BUG_ON(mt, mas.last != 0x2500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* prev:active ->active */ + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* prev:active -> active out of range*/ + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0x0FFF); + MT_BUG_ON(mt, !mas_active(mas)); + + /* prev: pause ->active */ + mas_set(&mas, 0x3600); + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != ptr3); + mas_pause(&mas); + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != ptr2); + MT_BUG_ON(mt, mas.index != 0x2000); + MT_BUG_ON(mt, mas.last != 0x2500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* prev:active -> active out of range*/ + entry = mas_prev(&mas, 0x1600); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x1501); + MT_BUG_ON(mt, mas.last != 0x1FFF); + MT_BUG_ON(mt, !mas_active(mas)); + + /* prev: active ->active, continue*/ + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find: start ->active */ + mas_set(&mas, 0); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find: pause ->active */ + mas_set(&mas, 0); + mas_pause(&mas); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find: start ->active on value */; + mas_set(&mas, 1200); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find:active ->active */ + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != ptr2); + MT_BUG_ON(mt, mas.index != 0x2000); + MT_BUG_ON(mt, mas.last != 0x2500); + MT_BUG_ON(mt, !mas_active(mas)); + + + /* find:active -> active (NULL)*/ + entry = mas_find(&mas, 0x2700); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x2501); + MT_BUG_ON(mt, mas.last != 0x2FFF); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find: none ->active */ + entry = mas_find(&mas, 0x5000); + MT_BUG_ON(mt, entry != ptr3); + MT_BUG_ON(mt, mas.index != 0x3000); + MT_BUG_ON(mt, mas.last != 0x3500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find:active -> active (NULL) end*/ + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, 
mas.index != 0x3501); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find_rev: active (END) ->active */ + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr3); + MT_BUG_ON(mt, mas.index != 0x3000); + MT_BUG_ON(mt, mas.last != 0x3500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find_rev:active ->active */ + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr2); + MT_BUG_ON(mt, mas.index != 0x2000); + MT_BUG_ON(mt, mas.last != 0x2500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find_rev: pause ->active */ + mas_pause(&mas); + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find_rev:active -> active */ + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0x0FFF); + MT_BUG_ON(mt, !mas_active(mas)); + + /* find_rev: start ->active */ + mas_set(&mas, 0x1200); + entry = mas_find_rev(&mas, 0); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk start ->active */ + mas_set(&mas, 0x1200); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk start ->active */ + mas_set(&mas, 0x1600); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x1501); + MT_BUG_ON(mt, mas.last != 0x1fff); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk pause ->active */ + mas_set(&mas, 0x1200); + mas_pause(&mas); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk pause -> active */ + mas_set(&mas, 0x1600); + mas_pause(&mas); + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x1501); + MT_BUG_ON(mt, mas.last != 0x1fff); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk none -> active */ + mas_set(&mas, 0x1200); + mas.node = MAS_NONE; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk none -> active */ + mas_set(&mas, 0x1600); + mas.node = MAS_NONE; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x1501); + MT_BUG_ON(mt, mas.last != 0x1fff); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk active -> active */ + mas.index = 0x1200; + mas.last = 0x1200; + mas.offset = 0; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_active(mas)); + + /* mas_walk active -> active */ + mas.index = 0x1600; + mas.last = 0x1600; + entry = mas_walk(&mas); + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 0x1501); + MT_BUG_ON(mt, mas.last != 0x1fff); + MT_BUG_ON(mt, !mas_active(mas)); + + mas_unlock(&mas); +} + static DEFINE_MTREE(tree); -static int maple_tree_seed(void) +static int __init maple_tree_seed(void) { - unsigned long set[] = {5015, 5014, 5017, 25, 1000, - 1001, 1002, 1003, 1005, 0, - 5003, 5002}; + unsigned long set[] = { 5015, 5014, 5017, 25, 1000, + 1001, 1002, 1003, 1005, 0, + 5003, 5002}; void *ptr = &set; pr_info("\nTEST STARTING\n\n"); @@ -2974,6 +3635,10 @@ static 
int maple_tree_seed(void) mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_state_handling(&tree); + mtree_destroy(&tree); + #if defined(BENCH) skip: #endif @@ -2988,7 +3653,7 @@ skip: return -EINVAL; } -static void maple_tree_harvest(void) +static void __exit maple_tree_harvest(void) { } diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h index f06df065dec0..2c34e8a33a1c 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -105,21 +105,3 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { #endif /* ZSTD_DEPS_IO */ #endif /* ZSTD_DEPS_NEED_IO */ - -/* - * Only requested when MSAN is enabled. - * Need: - * intptr_t - */ -#ifdef ZSTD_DEPS_NEED_STDINT -#ifndef ZSTD_DEPS_STDINT -#define ZSTD_DEPS_STDINT - -/* - * The Linux Kernel doesn't provide intptr_t, only uintptr_t, which - * is an unsigned long. - */ -typedef long intptr_t; - -#endif /* ZSTD_DEPS_STDINT */ -#endif /* ZSTD_DEPS_NEED_STDINT */ diff --git a/mm/Makefile b/mm/Makefile index e29afc890cde..678530a07326 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o \ + compaction.o show_mem.o\ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -89,6 +89,7 @@ obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o +obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o @@ -123,6 +124,7 @@ obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o +obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/compaction.c b/mm/compaction.c index c8bcdea15f5f..2c71fe27bc7b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -392,18 +392,14 @@ void reset_isolation_suitable(pg_data_t *pgdat) * Sets the pageblock skip bit if it was clear. Note that this is a hint as * locks are not required for read/writers. Returns true if it was already set. */ -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) +static bool test_and_set_skip(struct compact_control *cc, struct page *page) { bool skip; - /* Do no update if skip hint is being ignored */ + /* Do not update if skip hint is being ignored */ if (cc->ignore_skip_hint) return false; - if (!pageblock_aligned(pfn)) - return false; - skip = get_pageblock_skip(page); if (!skip && !cc->no_set_skip_hint) set_pageblock_skip(page); @@ -470,8 +466,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { } -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) +static bool test_and_set_skip(struct compact_control *cc, struct page *page) { return false; } @@ -765,6 +760,34 @@ static bool too_many_isolated(pg_data_t *pgdat) return too_many; } +/* + * Check if this base page should be skipped from isolation because + * it has extra refcounts that will prevent it from being migrated. 
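[The check_state_handling() test added above pins down the maple-tree iterator state machine: every operation is asserted to land in MAS_START, MAS_NONE, MAS_ROOT or an active node state. A minimal sketch of the same advanced API from a hypothetical in-kernel user, with a tree populated like the test's; nothing below is part of this series:

#include <linux/maple_tree.h>

static DEFINE_MTREE(example_tree);

static void example_iterate(void)
{
	MA_STATE(mas, &example_tree, 0, 0);	/* begins in MAS_START */
	void *entry;

	mtree_store_range(&example_tree, 0x1000, 0x1500, (void *)0x1, GFP_KERNEL);
	mtree_store_range(&example_tree, 0x2000, 0x2500, (void *)0x2, GFP_KERNEL);

	mas_lock(&mas);
	/* start -> active on the first step, active -> active afterwards */
	mas_for_each(&mas, entry, ULONG_MAX)
		pr_info("[%lx, %lx] -> %p\n", mas.index, mas.last, entry);

	/*
	 * mas_pause() forgets the position in the tree so the lock can be
	 * dropped; the next operation re-walks from mas.index, which is
	 * the pause -> active transition the test exercises.
	 */
	mas_pause(&mas);
	mas_unlock(&mas);

	mtree_destroy(&example_tree);
}
]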
+ * This function is called for regular pages only, and not + * for THP or hugetlbfs pages. This code is inspired by similar code + * in migrate_vma_check_page(), can_split_folio() and + * folio_migrate_mapping() + */ +static inline bool page_has_extrarefs(struct page *page, + struct address_space *mapping) +{ + unsigned long extra_refs; + + /* anonymous page can have extra ref from swap cache */ + if (mapping) + extra_refs = 1 + PagePrivate(page); + else + extra_refs = PageSwapCache(page) ? 1 : 0; + + /* + * This is an admittedly racy check but good enough to determine + * if a page is pinned and can not be migrated + */ + if ((page_ref_count(page) - extra_refs) > page_mapcount(page)) + return true; + return false; +} + /** * isolate_migratepages_block() - isolate all migrate-able pages within * a single pageblock @@ -1003,12 +1026,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; /* - * Migration will fail if an anonymous page is pinned in memory, - * so avoid taking lru_lock and isolating it unnecessarily in an - * admittedly racy check. + * Migration will fail if a page has extra refcounts + * preventing it from migrating, so avoid taking + * lru_lock and isolating it unnecessarily */ mapping = page_mapping(page); - if (!mapping && (page_count(page) - 1) > total_mapcount(page)) + if (page_has_extrarefs(page, mapping)) goto isolate_fail_put; /* @@ -1075,9 +1098,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, lruvec_memcg_debug(lruvec, page_folio(page)); /* Try get exclusive access under lock */ - if (!skip_updated) { + if (!skip_updated && valid_page) { skip_updated = true; - if (test_and_set_skip(cc, page, low_pfn)) + if (test_and_set_skip(cc, valid_page)) goto isolate_abort; } @@ -1736,6 +1759,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE */ static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; static int sysctl_extfrag_threshold = 500; +static int __read_mostly sysctl_compact_memory; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) @@ -1864,7 +1888,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } @@ -2456,7 +2479,8 @@ rescan: } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page - * within the current order-aligned block, scan the + * within the current order-aligned block and + * fast_find_migrateblock may be used then scan the * remainder of the pageblock. This will mark the * pageblock "skip" to avoid rescanning in the near * future. This will isolate more pages than necessary @@ -2464,8 +2488,9 @@ rescan: * fast_find_migrateblock revisiting blocks that were * recently partially scanned. 
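[To make the arithmetic in page_has_extrarefs() concrete, here is a standalone model of the heuristic with two worked cases; struct model_page and its fields are illustrative stand-ins, not kernel structures:

#include <assert.h>
#include <stdbool.h>

/*
 * Model of the heuristic: a page mapped into N page tables contributes
 * N to both refcount and mapcount; page-cache, fs-private and swap-cache
 * references contribute to the refcount only.
 */
struct model_page {
	int refcount;
	int mapcount;
	bool has_mapping;	/* file-backed: page cache holds a ref */
	bool page_private;	/* fs private data holds another ref */
	bool in_swap_cache;	/* anon page also in the swap cache */
};

static bool model_has_extrarefs(const struct model_page *p)
{
	int extra = p->has_mapping ? 1 + p->page_private
				   : (p->in_swap_cache ? 1 : 0);

	return (p->refcount - extra) > p->mapcount;
}

int main(void)
{
	/* anon page, mapped twice, in swap cache: 2 + 1 refs expected... */
	struct model_page ok = {
		.refcount = 3, .mapcount = 2, .in_swap_cache = true,
	};
	/* ...one more ref means someone pinned it: skip isolation */
	struct model_page pinned = {
		.refcount = 4, .mapcount = 2, .in_swap_cache = true,
	};

	assert(!model_has_extrarefs(&ok));
	assert(model_has_extrarefs(&pinned));
	return 0;
}
]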
*/ - if (cc->direct_compaction && !cc->finish_pageblock && - (cc->mode < MIGRATE_SYNC)) { + if (!pageblock_aligned(cc->migrate_pfn) && + !cc->ignore_skip_hint && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { cc->finish_pageblock = true; /* @@ -2780,6 +2805,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int static int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { + int ret; + + ret = proc_dointvec(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (sysctl_compact_memory != 1) + return -EINVAL; + if (write) compact_nodes(); @@ -3095,7 +3129,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, static struct ctl_table vm_compaction[] = { { .procname = "compact_memory", - .data = NULL, + .data = &sysctl_compact_memory, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, diff --git a/mm/debug.c b/mm/debug.c index c7b228097bd9..ee533a5ceb79 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } + +void vma_iter_dump_tree(const struct vma_iterator *vmi) +{ +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mas_dump(&vmi->mas); + mt_dump(vmi->mas.tree, mt_dump_hex); +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +} + #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c new file mode 100644 index 000000000000..f9d145730fd1 --- /dev/null +++ b/mm/debug_page_alloc.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/page-isolation.h> + +unsigned int _debug_guardpage_minorder; + +bool _debug_pagealloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +EXPORT_SYMBOL(_debug_pagealloc_enabled); + +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static int __init early_debug_pagealloc(char *buf) +{ + return kstrtobool(buf, &_debug_pagealloc_enabled_early); +} +early_param("debug_pagealloc", early_debug_pagealloc); + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + pr_err("Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + pr_info("Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype) +{ + if (order >= debug_guardpage_minorder()) + return false; + + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; +} + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype) +{ + __ClearPageGuard(page); + + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); +} diff --git a/mm/dmapool.c b/mm/dmapool.c index d2b0f8fc9649..a151a21e571b 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -226,7 +226,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { 
struct dma_pool *retval; size_t allocation; - bool empty = false; + bool empty; if (!dev) return NULL; @@ -276,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, */ mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - empty = true; + empty = list_empty(&dev->dma_pools); list_add(&retval->pools, &dev->dma_pools); mutex_unlock(&pools_lock); if (empty) { @@ -361,7 +360,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) void dma_pool_destroy(struct dma_pool *pool) { struct dma_page *page, *tmp; - bool empty = false, busy = false; + bool empty, busy = false; if (unlikely(!pool)) return; @@ -369,8 +368,7 @@ void dma_pool_destroy(struct dma_pool *pool) mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); - if (list_empty(&pool->dev->dma_pools)) - empty = true; + empty = list_empty(&pool->dev->dma_pools); mutex_unlock(&pools_lock); if (empty) device_remove_file(pool->dev, &dev_attr_pools); diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c new file mode 100644 index 000000000000..b1b09cce9394 --- /dev/null +++ b/mm/fail_page_alloc.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fault-inject.h> +#include <linux/mm.h> + +static struct { + struct fault_attr attr; + + bool ignore_gfp_highmem; + bool ignore_gfp_reclaim; + u32 min_order; +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_reclaim = true, + .ignore_gfp_highmem = true, + .min_order = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + int flags = 0; + + if (order < fail_page_alloc.min_order) + return false; + if (gfp_mask & __GFP_NOFAIL) + return false; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return false; + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* See comment in __should_failslab() */ + if (gfp_mask & __GFP_NOWARN) + flags |= FAULT_NOWARN; + + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); + + return 0; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..570bc8c3db87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> @@ -58,6 +59,8 @@ #include <asm/mman.h> +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. 
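[The new mm/fail_page_alloc.c moves the existing allocation fault-injection code out of page_alloc.c without changing behaviour; it is still driven by the fail_page_alloc= boot parameter or, with CONFIG_FAULT_INJECTION_DEBUG_FS, via debugfs. A sketch of arming it from userspace: the probability and times attributes come from the generic fault_attr, min-order is the knob created above, and the path assumes debugfs is mounted in the usual place:

#include <stdio.h>

/* Sketch: arm page-allocation fault injection via debugfs. */
static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	const char *base = "/sys/kernel/debug/fail_page_alloc";
	char path[256];

	snprintf(path, sizeof(path), "%s/probability", base);
	write_knob(path, "10");		/* fail ~10% of eligible allocations */
	snprintf(path, sizeof(path), "%s/times", base);
	write_knob(path, "-1");		/* no limit on number of failures */
	snprintf(path, sizeof(path), "%s/min-order", base);
	write_knob(path, "1");		/* leave order-0 allocations alone */
	return 0;
}
]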
@@ -1625,36 +1628,6 @@ void folio_end_writeback(struct folio *folio) } EXPORT_SYMBOL(folio_end_writeback); -/* - * After completing I/O on a page, call this routine to update the page - * flags appropriately - */ -void page_endio(struct page *page, bool is_write, int err) -{ - struct folio *folio = page_folio(page); - - if (!is_write) { - if (!err) { - folio_mark_uptodate(folio); - } else { - folio_clear_uptodate(folio); - folio_set_error(folio); - } - folio_unlock(folio); - } else { - if (err) { - struct address_space *mapping; - - folio_set_error(folio); - mapping = folio_mapping(folio); - if (mapping) - mapping_set_error(mapping, err); - } - folio_end_writeback(folio); - } -} -EXPORT_SYMBOL_GPL(page_endio); - /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock @@ -4119,3 +4092,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. + * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ + cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } 
+ } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. + * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ @@ -18,6 +18,7 @@ #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> +#include <linux/shmem_fs.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -959,16 +960,54 @@ static int faultin_page(struct vm_area_struct *vma, return 0; } +/* + * Writing to file-backed mappings which require folio dirty tracking using GUP + * is a fundamentally broken operation, as kernel write access to GUP mappings + * do not adhere to the semantics expected by a file system. + * + * Consider the following scenario:- + * + * 1. A folio is written to via GUP which write-faults the memory, notifying + * the file system and dirtying the folio. + * 2. Later, writeback is triggered, resulting in the folio being cleaned and + * the PTE being marked read-only. + * 3. The GUP caller writes to the folio, as it is mapped read/write via the + * direct mapping. + * 4. The GUP caller, now done with the page, unpins it and sets it dirty + * (though it does not have to). 
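[A minimal userspace caller of the new cachestat(2) syscall, for reference. The two struct layouts follow the uapi definitions this series adds; __NR_cachestat is assumed to be available from the updated kernel headers:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct cachestat_range {
	uint64_t off;
	uint64_t len;
};

struct cachestat {
	uint64_t nr_cache;
	uint64_t nr_dirty;
	uint64_t nr_writeback;
	uint64_t nr_evicted;
	uint64_t nr_recently_evicted;
};

int main(int argc, char **argv)
{
	struct cachestat_range csr = { .off = 0, .len = 1 << 20 };
	struct cachestat cs;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	/* query the first 1 MiB of the file; flags must be 0 for now */
	if (fd < 0 || syscall(__NR_cachestat, fd, &csr, &cs, 0)) {
		perror("cachestat");
		return 1;
	}
	printf("cached %llu dirty %llu writeback %llu evicted %llu recent %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}
]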
+ * + * This results in both data being written to a folio without writenotify, and + * the folio being dirtied unexpectedly (if the caller decides to do so). + */ +static bool writable_file_mapping_allowed(struct vm_area_struct *vma, + unsigned long gup_flags) +{ + /* + * If we aren't pinning then no problematic write can occur. A long term + * pin is the most egregious case so this is the case we disallow. + */ + if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != + (FOLL_PIN | FOLL_LONGTERM)) + return true; + + /* + * If the VMA does not require dirty tracking then no problematic write + * can occur either. + */ + return !vma_needs_dirty_tracking(vma); +} + static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; int write = (gup_flags & FOLL_WRITE); int foreign = (gup_flags & FOLL_REMOTE); + bool vma_anon = vma_is_anonymous(vma); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; - if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) + if ((gup_flags & FOLL_ANON) && !vma_anon) return -EFAULT; if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) @@ -978,6 +1017,10 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) return -EFAULT; if (write) { + if (!vma_anon && + !writable_file_mapping_allowed(vma, gup_flags)) + return -EFAULT; + if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; @@ -1024,8 +1067,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the @@ -1039,8 +1080,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_lock is held. - * * Must be called with mmap_lock held. It may be released. See below. 
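[Seen from a kernel caller, the effect of writable_file_mapping_allowed() is that a writable long-term pin of a dirty-tracked file mapping now fails up front rather than corrupting writeback accounting later. A hypothetical caller, for illustration only:

#include <linux/mm.h>

/* Sketch: long_term_pin_one() is illustrative, not part of this series. */
static int long_term_pin_one(unsigned long uaddr, struct page **page)
{
	long ret;

	mmap_read_lock(current->mm);
	/*
	 * For uaddr backed by e.g. an ext4 MAP_SHARED mapping,
	 * check_vma_flags() -> writable_file_mapping_allowed() now rejects
	 * the FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE combination (FOLL_PIN
	 * is added internally by pin_user_pages()); anonymous, shmem and
	 * hugetlb mappings still succeed.
	 */
	ret = pin_user_pages(uaddr, 1, FOLL_WRITE | FOLL_LONGTERM, page);
	mmap_read_unlock(current->mm);

	if (ret < 0)
		return ret;
	/* on success the caller must eventually unpin_user_page() */
	return ret == 1 ? 0 : -EFAULT;
}
]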
* * __get_user_pages walks a process's page tables and takes a reference to @@ -1076,7 +1115,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; @@ -1116,9 +1155,9 @@ static long __get_user_pages(struct mm_struct *mm, goto out; if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, - gup_flags, locked); + i = follow_hugetlb_page(mm, vma, pages, + &start, &nr_pages, i, + gup_flags, locked); if (!*locked) { /* * We've got a VM_FAULT_RETRY @@ -1183,10 +1222,6 @@ retry: ctx.page_mask = 0; } next_page: - if (vmas) { - vmas[i] = vma; - ctx.page_mask = 0; - } page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; @@ -1341,7 +1376,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, unsigned int flags) { @@ -1379,7 +1413,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, - vmas, locked); + locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; @@ -1443,7 +1477,7 @@ retry: *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, - pages, NULL, locked); + pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); @@ -1541,7 +1575,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked ? locked : &local_locked); + NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } @@ -1599,7 +1633,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, return -EINVAL; ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked); + NULL, locked); lru_add_drain(); return ret; } @@ -1667,8 +1701,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, - unsigned int foll_flags) + int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; @@ -1712,8 +1745,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, if (pages[i]) get_page(pages[i]); } - if (vmas) - vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; } @@ -1894,8 +1926,7 @@ struct page *get_dump_page(unsigned long addr) int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL, - &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? 
page : NULL; } @@ -2068,7 +2099,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, unsigned int gup_flags) { @@ -2076,13 +2106,13 @@ static long __gup_longterm_locked(struct mm_struct *mm, long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, locked, + pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; @@ -2100,9 +2130,8 @@ static long __gup_longterm_locked(struct mm_struct *mm, * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ -static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, - int *locked, unsigned int *gup_flags_p, - unsigned int to_set) +static bool is_valid_gup_args(struct page **pages, int *locked, + unsigned int *gup_flags_p, unsigned int to_set) { unsigned int gup_flags = *gup_flags_p; @@ -2144,13 +2173,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, (gup_flags & FOLL_PCI_P2PDMA))) return false; - /* - * Can't use VMAs with locked, as locked allows GUP to unlock - * which invalidates the vmas array - */ - if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE))) - return false; - *gup_flags_p = gup_flags; return true; } @@ -2165,8 +2187,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -2181,8 +2201,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_lock is held. - * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference @@ -2219,15 +2237,15 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? 
locked : &local_locked, gup_flags); } @@ -2237,7 +2255,7 @@ EXPORT_SYMBOL(get_user_pages_remote); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { return 0; } @@ -2251,8 +2269,6 @@ long get_user_pages_remote(struct mm_struct *mm, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to @@ -2260,16 +2276,15 @@ long get_user_pages_remote(struct mm_struct *mm, * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, &locked, gup_flags); + &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -2293,12 +2308,12 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - NULL, &locked, gup_flags); + &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2337,6 +2352,82 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ #ifdef CONFIG_HAVE_FAST_GUP +/* + * Used in the GUP-fast path to determine whether a pin is permitted for a + * specific folio. + * + * This call assumes the caller has pinned the folio, that the lowest page table + * level still points to this folio, and that interrupts have been disabled. + * + * Writing to pinned file-backed dirty tracked folios is inherently problematic + * (see comment describing the writable_file_mapping_allowed() function). We + * therefore try to avoid the most egregious case of a long-term mapping doing + * so. + * + * This function cannot be as thorough as that one as the VMA is not available + * in the fast path, so instead we whitelist known good cases and if in doubt, + * fall back to the slow path. + */ +static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) +{ + struct address_space *mapping; + unsigned long mapping_flags; + + /* + * If we aren't pinning then no problematic write can occur. A long term + * pin is the most egregious case so this is the one we disallow. + */ + if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != + (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) + return true; + + /* The folio is pinned, so we can safely access folio fields. */ + + if (WARN_ON_ONCE(folio_test_slab(folio))) + return false; + + /* hugetlb mappings do not require dirty-tracking. */ + if (folio_test_hugetlb(folio)) + return true; + + /* + * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods + * cannot proceed, which means no actions performed under RCU can + * proceed either. 
+ * + * inodes and thus their mappings are freed under RCU, which means the + * mapping cannot be freed beneath us and thus we can safely dereference + * it. + */ + lockdep_assert_irqs_disabled(); + + /* + * However, there may be operations which _alter_ the mapping, so ensure + * we read it once and only once. + */ + mapping = READ_ONCE(folio->mapping); + + /* + * The mapping may have been truncated, in any case we cannot determine + * if this mapping is safe - fall back to slow path to determine how to + * proceed. + */ + if (!mapping) + return false; + + /* Anonymous folios pose no problem. */ + mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; + if (mapping_flags) + return mapping_flags & PAGE_MAPPING_ANON; + + /* + * At this point, we know the mapping is non-null and points to an + * address_space object. The only remaining whitelisted file system is + * shmem. + */ + return shmem_mapping(mapping); +} + static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) @@ -2422,6 +2513,11 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, goto pte_unmap; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; + } + if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; @@ -2614,6 +2710,11 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } + if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; @@ -2680,6 +2781,10 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; @@ -2720,6 +2825,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; @@ -2755,6 +2865,16 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } + if (!folio_fast_pin_allowed(folio, flags)) { + gup_put_folio(folio, refs, flags); + return 0; + } + + if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return 0; + } + *nr += refs; folio_set_referenced(folio); return 1; @@ -2983,7 +3103,7 @@ static int internal_get_user_pages_fast(unsigned long start, start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, - pages, NULL, &locked, + pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* @@ -3025,7 +3145,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. 
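[Because the fast path has no VMA, folio_fast_pin_allowed() makes the equivalent decision from folio->mapping alone. Restated as a bare decision tree, same logic as above with the flag gate, slab and IRQ assertions elided for readability:

#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

/*
 * Restatement of folio_fast_pin_allowed() for the
 * FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE case; every other flag
 * combination is allowed outright.
 */
static bool fast_pin_ok(struct folio *folio)
{
	struct address_space *mapping = READ_ONCE(folio->mapping);
	unsigned long flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;

	if (folio_test_hugetlb(folio))
		return true;		/* no dirty tracking needed */
	if (!mapping)
		return false;		/* truncated: let the slow path decide */
	if (flags)
		return flags & PAGE_MAPPING_ANON;	/* anon (incl. KSM) only */
	return shmem_mapping(mapping);	/* the one whitelisted filesystem */
}
]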
*/ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; @@ -3058,7 +3178,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -3083,7 +3203,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -3098,8 +3218,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -3114,14 +3232,14 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } @@ -3135,8 +3253,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. @@ -3145,15 +3261,14 @@ EXPORT_SYMBOL(pin_user_pages_remote); * see Documentation/core-api/pin_user_pages.rst for details. 
*/ long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, &locked, gup_flags); + pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); @@ -3167,11 +3282,11 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; - return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); diff --git a/mm/gup_test.c b/mm/gup_test.c index 8ae7307a1bb6..1668ce0e0783 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -139,29 +139,27 @@ static int __gup_test_ioctl(unsigned int cmd, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_FAST_BENCHMARK: nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, gup->gup_flags | FOLL_LONGTERM, - pages + i, NULL); + pages + i); break; case DUMP_USER_PAGES_TEST: if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) nr = pin_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); else nr = get_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); break; default: ret = -EINVAL; @@ -271,7 +269,7 @@ static inline int pin_longterm_test_start(unsigned long arg) gup_flags, pages); else cur_pages = pin_user_pages(addr, remaining_pages, - gup_flags, pages, NULL); + gup_flags, pages); if (cur_pages < 0) { pin_longterm_test_stop(); ret = cur_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f154019e6b84..ea24718db4af 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6425,17 +6425,14 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, - int refs, struct page **pages, - struct vm_area_struct **vmas) +static void record_subpages(struct page *page, struct vm_area_struct *vma, + int refs, struct page **pages) { int nr; for (nr = 0; nr < refs; nr++) { if (likely(pages)) pages[nr] = nth_page(page, nr); - if (vmas) - vmas[nr] = vma; } } @@ -6508,9 +6505,9 @@ out_unlock: } long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *locked) + struct page **pages, unsigned long *position, + unsigned long *nr_pages, long i, unsigned int flags, + int *locked) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -6638,7 +6635,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * If subpage information not requested, update counters * and skip the same_page loop below. 
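[As the gup_test.c hunks show, the caller-side change for the vmas-array removal is mechanical across the tree. A sketch of an updated caller; example_pin() is illustrative, and callers that genuinely need the VMA must now look it up themselves (e.g. with vma_lookup()) while the mmap_lock is held:

#include <linux/mm.h>

static long example_pin(unsigned long addr, unsigned long nr_pages,
			struct page **pages)
{
	long nr;

	mmap_read_lock(current->mm);
	/* was: get_user_pages(addr, nr_pages, FOLL_WRITE, pages, NULL); */
	nr = get_user_pages(addr, nr_pages, FOLL_WRITE, pages);
	mmap_read_unlock(current->mm);

	/* pages obtained with FOLL_GET are released via put_page() */
	return nr;
}
]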
*/ - if (!pages && !vmas && !pfn_offset && + if (!pages && !pfn_offset && (vaddr + huge_page_size(h) < vma->vm_end) && (remainder >= pages_per_huge_page(h))) { vaddr += huge_page_size(h); @@ -6653,11 +6650,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); - if (pages || vmas) - record_subpages_vmas(nth_page(page, pfn_offset), - vma, refs, - likely(pages) ? pages + i : NULL, - vmas ? vmas + i : NULL); + if (pages) + record_subpages(nth_page(page, pfn_offset), + vma, refs, + likely(pages) ? pages + i : NULL); if (pages) { /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 27f001e0f0a2..f42079b73f82 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -384,8 +384,9 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, - gfp_t gfp_mask, struct list_head *list) + struct list_head *list) { + gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; @@ -413,12 +414,11 @@ out: * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. - * @gfp_mask: GFP flag for allocating vmemmap pages. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask) + unsigned long reuse) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { @@ -430,7 +430,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); - if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; mmap_read_lock(&init_mm); @@ -476,8 +476,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. 
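[For scale, the arithmetic behind the restore path that alloc_vmemmap_page_list() serves, as a standalone calculation; these are typical x86-64 numbers and sizeof(struct page) is assumed to be 64 bytes:

#include <stdio.h>

int main(void)
{
	unsigned long hpage_size = 2UL << 20;	/* 2 MiB hugetlb page */
	unsigned long page_size = 4096;
	unsigned long struct_page = 64;		/* assumed sizeof(struct page) */
	unsigned long vmemmap = hpage_size / page_size * struct_page;
	unsigned long vmemmap_pages = vmemmap / page_size;

	/*
	 * The optimization keeps one vmemmap page and remaps the rest to
	 * it; the restore path must reallocate the others, now with
	 * GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE.
	 */
	printf("vmemmap: %lu pages total, %lu freed, 1 kept\n",
	       vmemmap_pages, vmemmap_pages - 1);
	return 0;
}
]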
*/ - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, - GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse); if (!ret) { ClearHPageVmemmapOptimized(head); static_branch_dec(&hugetlb_optimize_vmemmap_key); diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97ac..bb6542279599 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -208,10 +208,12 @@ extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); -static inline bool is_check_pages_enabled(void) -{ - return static_branch_unlikely(&check_pages_enabled); -} +extern int min_free_kbytes; + +void setup_per_zone_wmarks(void); +void calculate_min_free_kbytes(void); +int __meminit init_per_zone_wmark_min(void); +void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed @@ -371,6 +373,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } +void set_zone_contiguous(struct zone *zone); + +static inline void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); @@ -416,6 +425,10 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); +void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + + int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); @@ -1047,17 +1060,17 @@ static inline void vma_iter_store(struct vma_iterator *vmi, { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { - printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); - printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); - printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree); + if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + vmi->mas.index > vma->vm_start)) { + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", + vmi->mas.index, vma->vm_start, vma->vm_start, + vma->vm_end, vmi->mas.index, vmi->mas.last); } - if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { - printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); - printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); - printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree); + if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + vmi->mas.last < vma->vm_start)) { + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, + vmi->mas.index, vmi->mas.last); } #endif diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b376a5d055e5..256930da578a 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -445,7 +445,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { - kasan_report((unsigned long)address, 1, false, ip); + kasan_report(address, 1, false, ip); return false; } return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 
e5eef670735e..224d161a5a22 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -40,39 +40,39 @@ * depending on memory access size X. */ -static __always_inline bool memory_is_poisoned_1(unsigned long addr) +static __always_inline bool memory_is_poisoned_1(const void *addr) { - s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr); if (unlikely(shadow_value)) { - s8 last_accessible_byte = addr & KASAN_GRANULE_MASK; + s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK; return unlikely(last_accessible_byte >= shadow_value); } return false; } -static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, +static __always_inline bool memory_is_poisoned_2_4_8(const void *addr, unsigned long size) { - u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr); /* * Access crosses 8(shadow size)-byte boundary. Such access maps * into 2 shadow bytes, so we need to check them both. */ - if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) + if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) return *shadow_addr || memory_is_poisoned_1(addr + size - 1); return memory_is_poisoned_1(addr + size - 1); } -static __always_inline bool memory_is_poisoned_16(unsigned long addr) +static __always_inline bool memory_is_poisoned_16(const void *addr) { - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr); /* Unaligned 16-bytes access maps into 3 shadow bytes. */ - if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE))) + if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE))) return *shadow_addr || memory_is_poisoned_1(addr + 15); return *shadow_addr; @@ -120,26 +120,25 @@ static __always_inline unsigned long memory_is_nonzero(const void *start, return bytes_is_nonzero(start, (end - start) % 8); } -static __always_inline bool memory_is_poisoned_n(unsigned long addr, - size_t size) +static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size) { unsigned long ret; - ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), - kasan_mem_to_shadow((void *)addr + size - 1) + 1); + ret = memory_is_nonzero(kasan_mem_to_shadow(addr), + kasan_mem_to_shadow(addr + size - 1) + 1); if (unlikely(ret)) { - unsigned long last_byte = addr + size - 1; - s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + const void *last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) + (((long)last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) return true; } return false; } -static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +static __always_inline bool memory_is_poisoned(const void *addr, size_t size) { if (__builtin_constant_p(size)) { switch (size) { @@ -159,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline bool check_region_inline(unsigned long addr, +static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { @@ -172,7 +171,7 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely(!addr_has_metadata((void 
*)addr))) + if (unlikely(!addr_has_metadata(addr))) return !kasan_report(addr, size, write, ret_ip); if (likely(!memory_is_poisoned(addr, size))) @@ -181,7 +180,7 @@ static __always_inline bool check_region_inline(unsigned long addr, return !kasan_report(addr, size, write, ret_ip); } -bool kasan_check_range(unsigned long addr, size_t size, bool write, +bool kasan_check_range(const void *addr, size_t size, bool write, unsigned long ret_ip) { return check_region_inline(addr, size, write, ret_ip); @@ -221,36 +220,37 @@ static void register_global(struct kasan_global *global) KASAN_GLOBAL_REDZONE, false); } -void __asan_register_globals(struct kasan_global *globals, size_t size) +void __asan_register_globals(void *ptr, ssize_t size) { int i; + struct kasan_global *globals = ptr; for (i = 0; i < size; i++) register_global(&globals[i]); } EXPORT_SYMBOL(__asan_register_globals); -void __asan_unregister_globals(struct kasan_global *globals, size_t size) +void __asan_unregister_globals(void *ptr, ssize_t size) { } EXPORT_SYMBOL(__asan_unregister_globals); #define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ + void __asan_load##size(void *addr) \ { \ check_region_inline(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_load##size); \ __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ + void __asan_load##size##_noabort(void *); \ EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ + void __asan_store##size(void *addr) \ { \ check_region_inline(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_store##size); \ __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ + void __asan_store##size##_noabort(void *); \ EXPORT_SYMBOL(__asan_store##size##_noabort) DEFINE_ASAN_LOAD_STORE(1); @@ -259,24 +259,24 @@ DEFINE_ASAN_LOAD_STORE(4); DEFINE_ASAN_LOAD_STORE(8); DEFINE_ASAN_LOAD_STORE(16); -void __asan_loadN(unsigned long addr, size_t size) +void __asan_loadN(void *addr, ssize_t size) { kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); __alias(__asan_loadN) -void __asan_loadN_noabort(unsigned long, size_t); +void __asan_loadN_noabort(void *, ssize_t); EXPORT_SYMBOL(__asan_loadN_noabort); -void __asan_storeN(unsigned long addr, size_t size) +void __asan_storeN(void *addr, ssize_t size) { kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); __alias(__asan_storeN) -void __asan_storeN_noabort(unsigned long, size_t); +void __asan_storeN_noabort(void *, ssize_t); EXPORT_SYMBOL(__asan_storeN_noabort); /* to shut up compiler complaints */ @@ -284,7 +284,7 @@ void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); /* Emitted by compiler to poison alloca()ed objects. 
*/ -void __asan_alloca_poison(unsigned long addr, size_t size) +void __asan_alloca_poison(void *addr, ssize_t size) { size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE); size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - @@ -295,7 +295,7 @@ void __asan_alloca_poison(unsigned long addr, size_t size) KASAN_ALLOCA_REDZONE_SIZE); const void *right_redzone = (const void *)(addr + rounded_up_size); - WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE)); kasan_unpoison((const void *)(addr + rounded_down_size), size - rounded_down_size, false); @@ -307,18 +307,18 @@ void __asan_alloca_poison(unsigned long addr, size_t size) EXPORT_SYMBOL(__asan_alloca_poison); /* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom) { - if (unlikely(!stack_top || stack_top > stack_bottom)) + if (unlikely(!stack_top || stack_top > (void *)stack_bottom)) return; - kasan_unpoison(stack_top, stack_bottom - stack_top, false); + kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false); } EXPORT_SYMBOL(__asan_allocas_unpoison); /* Emitted by the compiler to [un]poison local variables. */ #define DEFINE_ASAN_SET_SHADOW(byte) \ - void __asan_set_shadow_##byte(const void *addr, size_t size) \ + void __asan_set_shadow_##byte(const void *addr, ssize_t size) \ { \ __memset((void *)addr, 0x##byte, size); \ } \ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index f5e4f5f2ba20..b799f11e45dc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -198,13 +198,13 @@ enum kasan_report_type { struct kasan_report_info { /* Filled in by kasan_report_*(). */ enum kasan_report_type type; - void *access_addr; + const void *access_addr; size_t access_size; bool is_write; unsigned long ip; /* Filled in by the common reporting code. 
*/ - void *first_bad_addr; + const void *first_bad_addr; struct kmem_cache *cache; void *object; size_t alloc_size; @@ -311,7 +311,7 @@ static __always_inline bool addr_has_metadata(const void *addr) * @ret_ip: return address * @return: true if access was valid, false if invalid */ -bool kasan_check_range(unsigned long addr, size_t size, bool write, +bool kasan_check_range(const void *addr, size_t size, bool write, unsigned long ret_ip); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -323,7 +323,7 @@ static __always_inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -void *kasan_find_first_bad_addr(void *addr, size_t size); +const void *kasan_find_first_bad_addr(const void *addr, size_t size); size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); @@ -346,7 +346,7 @@ void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } #endif -bool kasan_report(unsigned long addr, size_t size, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); @@ -571,79 +571,82 @@ void kasan_restore_multi_shot(bool enabled); */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); -void __asan_register_globals(struct kasan_global *globals, size_t size); -void __asan_unregister_globals(struct kasan_global *globals, size_t size); +void __asan_register_globals(void *globals, ssize_t size); +void __asan_unregister_globals(void *globals, ssize_t size); void __asan_handle_no_return(void); -void __asan_alloca_poison(unsigned long addr, size_t size); -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); - -void __asan_load1(unsigned long addr); -void __asan_store1(unsigned long addr); -void __asan_load2(unsigned long addr); -void __asan_store2(unsigned long addr); -void __asan_load4(unsigned long addr); -void __asan_store4(unsigned long addr); -void __asan_load8(unsigned long addr); -void __asan_store8(unsigned long addr); -void __asan_load16(unsigned long addr); -void __asan_store16(unsigned long addr); -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); - -void __asan_load1_noabort(unsigned long addr); -void __asan_store1_noabort(unsigned long addr); -void __asan_load2_noabort(unsigned long addr); -void __asan_store2_noabort(unsigned long addr); -void __asan_load4_noabort(unsigned long addr); -void __asan_store4_noabort(unsigned long addr); -void __asan_load8_noabort(unsigned long addr); -void __asan_store8_noabort(unsigned long addr); -void __asan_load16_noabort(unsigned long addr); -void __asan_store16_noabort(unsigned long addr); -void __asan_loadN_noabort(unsigned long addr, size_t size); -void __asan_storeN_noabort(unsigned long addr, size_t size); - -void __asan_report_load1_noabort(unsigned long addr); -void __asan_report_store1_noabort(unsigned long addr); -void __asan_report_load2_noabort(unsigned long addr); -void __asan_report_store2_noabort(unsigned long addr); -void __asan_report_load4_noabort(unsigned long addr); -void __asan_report_store4_noabort(unsigned long addr); -void __asan_report_load8_noabort(unsigned long addr); -void __asan_report_store8_noabort(unsigned long addr); 
-void __asan_report_load16_noabort(unsigned long addr); -void __asan_report_store16_noabort(unsigned long addr); -void __asan_report_load_n_noabort(unsigned long addr, size_t size); -void __asan_report_store_n_noabort(unsigned long addr, size_t size); - -void __asan_set_shadow_00(const void *addr, size_t size); -void __asan_set_shadow_f1(const void *addr, size_t size); -void __asan_set_shadow_f2(const void *addr, size_t size); -void __asan_set_shadow_f3(const void *addr, size_t size); -void __asan_set_shadow_f5(const void *addr, size_t size); -void __asan_set_shadow_f8(const void *addr, size_t size); - -void *__asan_memset(void *addr, int c, size_t len); -void *__asan_memmove(void *dest, const void *src, size_t len); -void *__asan_memcpy(void *dest, const void *src, size_t len); - -void __hwasan_load1_noabort(unsigned long addr); -void __hwasan_store1_noabort(unsigned long addr); -void __hwasan_load2_noabort(unsigned long addr); -void __hwasan_store2_noabort(unsigned long addr); -void __hwasan_load4_noabort(unsigned long addr); -void __hwasan_store4_noabort(unsigned long addr); -void __hwasan_load8_noabort(unsigned long addr); -void __hwasan_store8_noabort(unsigned long addr); -void __hwasan_load16_noabort(unsigned long addr); -void __hwasan_store16_noabort(unsigned long addr); -void __hwasan_loadN_noabort(unsigned long addr, size_t size); -void __hwasan_storeN_noabort(unsigned long addr, size_t size); - -void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); - -void *__hwasan_memset(void *addr, int c, size_t len); -void *__hwasan_memmove(void *dest, const void *src, size_t len); -void *__hwasan_memcpy(void *dest, const void *src, size_t len); +void __asan_alloca_poison(void *, ssize_t size); +void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom); + +void __asan_load1(void *); +void __asan_store1(void *); +void __asan_load2(void *); +void __asan_store2(void *); +void __asan_load4(void *); +void __asan_store4(void *); +void __asan_load8(void *); +void __asan_store8(void *); +void __asan_load16(void *); +void __asan_store16(void *); +void __asan_loadN(void *, ssize_t size); +void __asan_storeN(void *, ssize_t size); + +void __asan_load1_noabort(void *); +void __asan_store1_noabort(void *); +void __asan_load2_noabort(void *); +void __asan_store2_noabort(void *); +void __asan_load4_noabort(void *); +void __asan_store4_noabort(void *); +void __asan_load8_noabort(void *); +void __asan_store8_noabort(void *); +void __asan_load16_noabort(void *); +void __asan_store16_noabort(void *); +void __asan_loadN_noabort(void *, ssize_t size); +void __asan_storeN_noabort(void *, ssize_t size); + +void __asan_report_load1_noabort(void *); +void __asan_report_store1_noabort(void *); +void __asan_report_load2_noabort(void *); +void __asan_report_store2_noabort(void *); +void __asan_report_load4_noabort(void *); +void __asan_report_store4_noabort(void *); +void __asan_report_load8_noabort(void *); +void __asan_report_store8_noabort(void *); +void __asan_report_load16_noabort(void *); +void __asan_report_store16_noabort(void *); +void __asan_report_load_n_noabort(void *, ssize_t size); +void __asan_report_store_n_noabort(void *, ssize_t size); + +void __asan_set_shadow_00(const void *addr, ssize_t size); +void __asan_set_shadow_f1(const void *addr, ssize_t size); +void __asan_set_shadow_f2(const void *addr, ssize_t size); +void __asan_set_shadow_f3(const void *addr, ssize_t size); +void __asan_set_shadow_f5(const void *addr, ssize_t size); +void __asan_set_shadow_f8(const void 
*addr, ssize_t size); + +void *__asan_memset(void *addr, int c, ssize_t len); +void *__asan_memmove(void *dest, const void *src, ssize_t len); +void *__asan_memcpy(void *dest, const void *src, ssize_t len); + +void __hwasan_load1_noabort(void *); +void __hwasan_store1_noabort(void *); +void __hwasan_load2_noabort(void *); +void __hwasan_store2_noabort(void *); +void __hwasan_load4_noabort(void *); +void __hwasan_store4_noabort(void *); +void __hwasan_load8_noabort(void *); +void __hwasan_store8_noabort(void *); +void __hwasan_load16_noabort(void *); +void __hwasan_store16_noabort(void *); +void __hwasan_loadN_noabort(void *, ssize_t size); +void __hwasan_storeN_noabort(void *, ssize_t size); + +void __hwasan_tag_memory(void *, u8 tag, ssize_t size); + +void *__hwasan_memset(void *addr, int c, ssize_t len); +void *__hwasan_memmove(void *dest, const void *src, ssize_t len); +void *__hwasan_memcpy(void *dest, const void *src, ssize_t len); + +void kasan_tag_mismatch(void *addr, unsigned long access_info, + unsigned long ret_ip); #endif /* __MM_KASAN_KASAN_H */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 892a9dc9d4d3..84d9f3b37014 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -211,7 +211,7 @@ static void start_report(unsigned long *flags, bool sync) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags, void *addr) +static void end_report(unsigned long *flags, const void *addr) { if (addr) trace_error_report_end(ERROR_DETECTOR_KASAN, @@ -450,8 +450,8 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *addr = kasan_reset_tag(info->access_addr); - u8 tag = get_tag(info->access_addr); + void *addr = kasan_reset_tag((void *)info->access_addr); + u8 tag = get_tag((void *)info->access_addr); print_error_description(info); if (addr_has_metadata(addr)) @@ -468,12 +468,12 @@ static void print_report(struct kasan_report_info *info) static void complete_report_info(struct kasan_report_info *info) { - void *addr = kasan_reset_tag(info->access_addr); + void *addr = kasan_reset_tag((void *)info->access_addr); struct slab *slab; if (info->type == KASAN_REPORT_ACCESS) info->first_bad_addr = kasan_find_first_bad_addr( - info->access_addr, info->access_size); + (void *)info->access_addr, info->access_size); else info->first_bad_addr = addr; @@ -544,11 +544,10 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty * user_access_save/restore(): kasan_report_invalid_free() cannot be called * from a UACCESS region, and kasan_report_async() is not used on x86. 
*/ -bool kasan_report(unsigned long addr, size_t size, bool is_write, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip) { bool ret = true; - void *ptr = (void *)addr; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; struct kasan_report_info info; @@ -562,7 +561,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; - info.access_addr = ptr; + info.access_addr = addr; info.access_size = size; info.is_write = is_write; info.ip = ip; @@ -571,7 +570,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, print_report(&info); - end_report(&irq_flags, ptr); + end_report(&irq_flags, (void *)addr); out: user_access_restore(ua_flags); diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 87d39bc0a673..51a1e8a8877f 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -30,9 +30,9 @@ #include "kasan.h" #include "../slab.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { - void *p = addr; + const void *p = addr; if (!addr_has_metadata(p)) return p; @@ -362,14 +362,14 @@ void kasan_print_address_stack_frame(const void *addr) #endif /* CONFIG_KASAN_STACK */ #define DEFINE_ASAN_REPORT_LOAD(size) \ -void __asan_report_load##size##_noabort(unsigned long addr) \ +void __asan_report_load##size##_noabort(void *addr) \ { \ kasan_report(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_report_load##size##_noabort) #define DEFINE_ASAN_REPORT_STORE(size) \ -void __asan_report_store##size##_noabort(unsigned long addr) \ +void __asan_report_store##size##_noabort(void *addr) \ { \ kasan_report(addr, size, true, _RET_IP_); \ } \ @@ -386,13 +386,13 @@ DEFINE_ASAN_REPORT_STORE(4); DEFINE_ASAN_REPORT_STORE(8); DEFINE_ASAN_REPORT_STORE(16); -void __asan_report_load_n_noabort(unsigned long addr, size_t size) +void __asan_report_load_n_noabort(void *addr, ssize_t size) { kasan_report(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_report_load_n_noabort); -void __asan_report_store_n_noabort(unsigned long addr, size_t size) +void __asan_report_store_n_noabort(void *addr, ssize_t size) { kasan_report(addr, size, true, _RET_IP_); } diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 32e80f78de7d..065e1b2fc484 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -15,7 +15,7 @@ #include "kasan.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { /* * Hardware Tag-Based KASAN only calls this function for normal memory diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 8b1f5a73ee6d..689e94f9fe3c 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -30,7 +30,7 @@ #include "kasan.h" #include "../slab.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { u8 tag = get_tag(addr); void *p = kasan_reset_tag(addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index c8b86f3273b5..3e62728ae25d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -28,13 +28,13 @@ bool __kasan_check_read(const volatile void *p, unsigned int size) { - return kasan_check_range((unsigned long)p, size, false, _RET_IP_); + return kasan_check_range((void *)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); bool 
__kasan_check_write(const volatile void *p, unsigned int size) { - return kasan_check_range((unsigned long)p, size, true, _RET_IP_); + return kasan_check_range((void *)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); @@ -50,7 +50,7 @@ EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range(addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -60,8 +60,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -71,17 +71,17 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); } #endif -void *__asan_memset(void *addr, int c, size_t len) +void *__asan_memset(void *addr, int c, ssize_t len) { - if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range(addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -89,10 +89,10 @@ void *__asan_memset(void *addr, int c, size_t len) EXPORT_SYMBOL(__asan_memset); #ifdef __HAVE_ARCH_MEMMOVE -void *__asan_memmove(void *dest, const void *src, size_t len) +void *__asan_memmove(void *dest, const void *src, ssize_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -100,10 +100,10 @@ void *__asan_memmove(void *dest, const void *src, size_t len) EXPORT_SYMBOL(__asan_memmove); #endif -void *__asan_memcpy(void *dest, const void *src, size_t len) +void *__asan_memcpy(void *dest, const void *src, ssize_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); @@ -111,13 +111,13 @@ void *__asan_memcpy(void *dest, const void *src, size_t len) EXPORT_SYMBOL(__asan_memcpy); #ifdef CONFIG_KASAN_SW_TAGS -void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset); +void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset); EXPORT_SYMBOL(__hwasan_memset); #ifdef __HAVE_ARCH_MEMMOVE -void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove); +void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove); EXPORT_SYMBOL(__hwasan_memmove); #endif -void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy); +void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy); EXPORT_SYMBOL(__hwasan_memcpy); #endif diff --git a/mm/kasan/sw_tags.c 
b/mm/kasan/sw_tags.c index 30da65fa02a1..220b5d4c6876 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -70,8 +70,8 @@ u8 kasan_random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -bool kasan_check_range(unsigned long addr, size_t size, bool write, - unsigned long ret_ip) +bool kasan_check_range(const void *addr, size_t size, bool write, + unsigned long ret_ip) { u8 tag; u8 *shadow_first, *shadow_last, *shadow; @@ -133,12 +133,12 @@ bool kasan_byte_accessible(const void *addr) } #define DEFINE_HWASAN_LOAD_STORE(size) \ - void __hwasan_load##size##_noabort(unsigned long addr) \ + void __hwasan_load##size##_noabort(void *addr) \ { \ - kasan_check_range(addr, size, false, _RET_IP_); \ + kasan_check_range(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ - void __hwasan_store##size##_noabort(unsigned long addr) \ + void __hwasan_store##size##_noabort(void *addr) \ { \ kasan_check_range(addr, size, true, _RET_IP_); \ } \ @@ -150,25 +150,25 @@ DEFINE_HWASAN_LOAD_STORE(4); DEFINE_HWASAN_LOAD_STORE(8); DEFINE_HWASAN_LOAD_STORE(16); -void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) +void __hwasan_loadN_noabort(void *addr, ssize_t size) { kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__hwasan_loadN_noabort); -void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) +void __hwasan_storeN_noabort(void *addr, ssize_t size) { kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__hwasan_storeN_noabort); -void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) +void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size) { - kasan_poison((void *)addr, size, tag, false); + kasan_poison(addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); -void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, +void kasan_tag_mismatch(void *addr, unsigned long access_info, unsigned long ret_ip) { kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b27e245a055..d31fb1e2cb33 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) } } -static void do_flush_stats(bool atomic) +static void do_flush_stats(void) { /* * We always flush the entire tree, so concurrent flushers can just @@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic) WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); - if (atomic) - cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup); - else - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + cgroup_rstat_flush(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); atomic_set(&stats_flush_ongoing, 0); } -static bool should_flush_stats(void) -{ - return atomic_read(&stats_flush_threshold) > num_online_cpus(); -} - void mem_cgroup_flush_stats(void) { - if (should_flush_stats()) - do_flush_stats(false); -} - -void mem_cgroup_flush_stats_atomic(void) -{ - if (should_flush_stats()) - do_flush_stats(true); + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + do_flush_stats(); } void mem_cgroup_flush_stats_ratelimited(void) @@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * Always flush here so that flushing in latency-sensitive paths is * as cheap as possible. 
*/ - do_flush_stats(false); + do_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } @@ -1580,13 +1566,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) +static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { - struct seq_buf s; int i; - seq_buf_init(&s, buf, bufsize); - /* * Provide statistics on the state of the memory subsystem as * well as cumulative event counters that show past behavior. @@ -1603,21 +1586,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) u64 size; size = memcg_page_state_output(memcg, memory_stats[i].idx); - seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); + seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size); if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { size += memcg_page_state_output(memcg, NR_SLAB_RECLAIMABLE_B); - seq_buf_printf(&s, "slab %llu\n", size); + seq_buf_printf(s, "slab %llu\n", size); } } /* Accumulated memory events */ - seq_buf_printf(&s, "pgscan %lu\n", + seq_buf_printf(s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + memcg_events(memcg, PGSCAN_DIRECT) + memcg_events(memcg, PGSCAN_KHUGEPAGED)); - seq_buf_printf(&s, "pgsteal %lu\n", + seq_buf_printf(s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT) + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); @@ -1627,13 +1610,24 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_vm_event_stat[i] == PGPGOUT) continue; - seq_buf_printf(&s, "%s %lu\n", + seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); } /* The above should easily fit into one page */ - WARN_ON_ONCE(seq_buf_has_overflowed(&s)); + WARN_ON_ONCE(seq_buf_has_overflowed(s)); +} + +static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); + +static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg_stat_format(memcg, s); + else + memcg1_stat_format(memcg, s); + WARN_ON_ONCE(seq_buf_has_overflowed(s)); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -1671,6 +1665,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { /* Use static buffer, for the caller is holding oom_lock. 
*/ static char buf[PAGE_SIZE]; + struct seq_buf s; lockdep_assert_held(&oom_lock); @@ -1693,8 +1688,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("Memory cgroup stats for "); pr_cont_cgroup_path(memcg->css.cgroup); pr_cont(":"); - memory_stat_format(memcg, buf, sizeof(buf)); - pr_info("%s", buf); + seq_buf_init(&s, buf, sizeof(buf)); + memory_stat_format(memcg, &s); + seq_buf_do_printk(&s, KERN_INFO); } /* @@ -2028,26 +2024,12 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked) mem_cgroup_oom_notify(memcg); - if (locked && !READ_ONCE(memcg->oom_kill_disable)) { - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, - current->memcg_oom_order); - } else { - schedule(); - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); - } + schedule(); + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); - if (locked) { + if (locked) mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitly. - */ - memcg_oom_recover(memcg); - } cleanup: current->memcg_in_oom = NULL; css_put(&memcg->css); @@ -2275,7 +2257,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) { + if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } @@ -2290,7 +2272,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_stock(struct memcg_stock_pcp *stock) { - struct mem_cgroup *old = stock->cached; + struct mem_cgroup *old = READ_ONCE(stock->cached); if (!old) return; @@ -2303,7 +2285,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) } css_put(&old->css); - stock->cached = NULL; + WRITE_ONCE(stock->cached, NULL); } static void drain_local_stock(struct work_struct *dummy) @@ -2338,10 +2320,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) struct memcg_stock_pcp *stock; stock = this_cpu_ptr(&memcg_stock); - if (stock->cached != memcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ drain_stock(stock); css_get(&memcg->css); - stock->cached = memcg; + WRITE_ONCE(stock->cached, memcg); } stock->nr_pages += nr_pages; @@ -2383,7 +2365,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) bool flush = false; rcu_read_lock(); - memcg = stock->cached; + memcg = READ_ONCE(stock->cached); if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; @@ -3208,12 +3190,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, * accumulating over a page of vmstat data or when pgdat or idx * changes. */ - if (stock->cached_objcg != objcg) { + if (READ_ONCE(stock->cached_objcg) != objcg) { old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - stock->cached_objcg = objcg; + WRITE_ONCE(stock->cached_objcg, objcg); stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ @@ -3267,7 +3249,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } @@ -3279,7 +3261,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { - struct obj_cgroup *old = stock->cached_objcg; + struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) return NULL; @@ -3332,7 +3314,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) stock->cached_pgdat = NULL; } - stock->cached_objcg = NULL; + WRITE_ONCE(stock->cached_objcg, NULL); /* * The `old' objects needs to be released by the caller via * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. @@ -3343,10 +3325,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) { + struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); struct mem_cgroup *memcg; - if (stock->cached_objcg) { - memcg = obj_cgroup_memcg(stock->cached_objcg); + if (objcg) { + memcg = obj_cgroup_memcg(objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3365,10 +3348,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (stock->cached_objcg != objcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ old = drain_obj_stock(stock); obj_cgroup_get(objcg); - stock->cached_objcg = objcg; + WRITE_ONCE(stock->cached_objcg, objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; allow_uncharge = true; /* Allow uncharge when objcg changes */ @@ -3699,27 +3682,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) if (mem_cgroup_is_root(memcg)) { /* - * We can reach here from irq context through: - * uncharge_batch() - * |--memcg_check_events() - * |--mem_cgroup_threshold() - * |--__mem_cgroup_threshold() - * |--mem_cgroup_usage - * - * rstat flushing is an expensive operation that should not be - * done from irq context; use stale stats in this case. - * Arguably, usage threshold events are not reliable on the root - * memcg anyway since its usage is ill-defined. - * - * Additionally, other call paths through memcg_check_events() - * disable irqs, so make sure we are flushing stats atomically. + * Approximate root's usage from global state. This isn't + * perfect, but the root usage was always an approximation. 
*/ - if (in_task()) - mem_cgroup_flush_stats_atomic(); - val = memcg_page_state(memcg, NR_FILE_PAGES) + - memcg_page_state(memcg, NR_ANON_MAPPED); + val = global_node_page_state(NR_FILE_PAGES) + + global_node_page_state(NR_ANON_MAPPED); if (swap) - val += memcg_page_state(memcg, MEMCG_SWAP); + val += total_swap_pages - get_nr_swap_pages(); } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -4135,9 +4104,8 @@ static const unsigned int memcg1_events[] = { PGMAJFAULT, }; -static int memcg_stat_show(struct seq_file *m, void *v) +static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -4152,18 +4120,18 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) - seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), - memcg_events_local(memcg, memcg1_events[i])); + seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), + memcg_events_local(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "%s %lu\n", lru_list_name(i), - memcg_page_state_local(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + seq_buf_printf(s, "%s %lu\n", lru_list_name(i), + memcg_page_state_local(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -4171,11 +4139,11 @@ static int memcg_stat_show(struct seq_file *m, void *v) memory = min(memory, READ_ONCE(mi->memory.max)); memsw = min(memsw, READ_ONCE(mi->memsw.max)); } - seq_printf(m, "hierarchical_memory_limit %llu\n", - (u64)memory * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); if (do_memsw_account()) - seq_printf(m, "hierarchical_memsw_limit %llu\n", - (u64)memsw * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4183,19 +4151,19 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) - seq_printf(m, "total_%s %llu\n", - vm_event_name(memcg1_events[i]), - (u64)memcg_events(memcg, memcg1_events[i])); + seq_buf_printf(s, "total_%s %llu\n", + vm_event_name(memcg1_events[i]), + (u64)memcg_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "total_%s %llu\n", lru_list_name(i), - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -4210,12 +4178,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) anon_cost += mz->lruvec.anon_cost; file_cost += mz->lruvec.file_cost; } - seq_printf(m, "anon_cost %lu\n", anon_cost); - seq_printf(m, "file_cost %lu\n", file_cost); + seq_buf_printf(s, "anon_cost %lu\n", anon_cost); + seq_buf_printf(s, 
"file_cost %lu\n", file_cost); } #endif - - return 0; } static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, @@ -4648,11 +4614,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - /* - * wb_writeback() takes a spinlock and calls - * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep. - */ - mem_cgroup_flush_stats_atomic(); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5059,6 +5021,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p) } #endif +static int memory_stat_show(struct seq_file *m, void *v); + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5091,7 +5055,7 @@ static struct cftype mem_cgroup_legacy_files[] = { }, { .name = "stat", - .seq_show = memcg_stat_show, + .seq_show = memory_stat_show, }, { .name = "force_empty", @@ -6634,10 +6598,12 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + struct seq_buf s; if (!buf) return -ENOMEM; - memory_stat_format(memcg, buf, PAGE_SIZE); + seq_buf_init(&s, buf, PAGE_SIZE); + memory_stat_format(memcg, &s); seq_puts(m, buf); kfree(buf); return 0; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5b663eca1f29..004a02f44271 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -123,7 +123,6 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -#ifdef CONFIG_SYSCTL static struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", @@ -146,14 +145,6 @@ static struct ctl_table memory_failure_table[] = { { } }; -static int __init memory_failure_sysctl_init(void) -{ - register_sysctl_init("vm", memory_failure_table); - return 0; -} -late_initcall(memory_failure_sysctl_init); -#endif /* CONFIG_SYSCTL */ - /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -2441,6 +2432,8 @@ static int __init memory_failure_init(void) INIT_WORK(&mf_cpu->work, memory_failure_work_func); } + register_sysctl_init("vm", memory_failure_table); + return 0; } core_initcall(memory_failure_init); diff --git a/mm/memory.c b/mm/memory.c index 146bb94764f8..f2ae86338545 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5590,7 +5590,6 @@ EXPORT_SYMBOL_GPL(generic_access_phys); int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - struct vm_area_struct *vma; void *old_buf = buf; int write = gup_flags & FOLL_WRITE; @@ -5599,29 +5598,30 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, /* ignore errors, just check how much was successfully transferred */ while (len) { - int bytes, ret, offset; + int bytes, offset; void *maddr; - struct page *page = NULL; + struct vm_area_struct *vma; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); - ret = get_user_pages_remote(mm, addr, 1, - gup_flags, &page, &vma, NULL); - if (ret <= 0) { + if (IS_ERR_OR_NULL(page)) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; #else + int res = 0; + /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. 
*/ - vma = vma_lookup(mm, addr); if (!vma) break; if (vma->vm_ops && vma->vm_ops->access) - ret = vma->vm_ops->access(vma, addr, buf, + res = vma->vm_ops->access(vma, addr, buf, len, write); - if (ret <= 0) + if (res <= 0) break; - bytes = ret; + bytes = res; #endif } else { bytes = len; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e0fa209d533..9061ac69b1b6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -325,7 +325,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, } if (check_pfn_span(pfn, nr_pages)) { - WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); + WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); return -EINVAL; } @@ -525,7 +525,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, map_offset = vmem_altmap_offset(altmap); if (check_pfn_span(pfn, nr_pages)) { - WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); + WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); return; } diff --git a/mm/migrate.c b/mm/migrate.c index 01cac26a3127..cb292d2a90ce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -692,37 +692,32 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) { struct buffer_head *bh = head; + struct buffer_head *failed_bh; - /* Simple case, sync compaction */ - if (mode != MIGRATE_ASYNC) { - do { - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return true; - } - - /* async case, we cannot block on lock_buffer so use trylock_buffer */ do { if (!trylock_buffer(bh)) { - /* - * We failed to lock the buffer and cannot stall in - * async migration. Release the taken locks - */ - struct buffer_head *failed_bh = bh; - bh = head; - while (bh != failed_bh) { - unlock_buffer(bh); - bh = bh->b_this_page; - } - return false; + if (mode == MIGRATE_ASYNC) + goto unlock; + if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh)) + goto unlock; + lock_buffer(bh); } bh = bh->b_this_page; } while (bh != head); + return true; + +unlock: + /* We failed to lock the buffer and cannot stall. */ + failed_bh = bh; + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + bh = bh->b_this_page; + } + + return false; } static int __buffer_migrate_folio(struct address_space *mapping, @@ -1156,6 +1151,14 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page if (current->flags & PF_MEMALLOC) goto out; + /* + * In "light" mode, we can wait for transient locks (eg + * inserting a page into the page table), but it's not + * worth waiting for I/O. 
+ */ + if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src)) + goto out; + folio_lock(src); } locked = true; @@ -1614,13 +1617,10 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, int nr_pass) { int retry = 1; - int large_retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_retry_pages = 0; - int nr_large_failed = 0; int pass = 0; - bool is_large = false; bool is_thp = false; struct folio *folio, *folio2, *dst = NULL, *dst2; int rc, rc_saved = 0, nr_pages; @@ -1631,20 +1631,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && !list_empty(from) && !list_is_singular(from)); - for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { + for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; - large_retry = 0; thp_retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { - /* - * Large folio statistics is based on the source large - * folio. Capture required information that might get - * lost during migration. - */ - is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); + is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); @@ -1660,7 +1653,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, * list is processed. */ if (!thp_migration_supported() && is_thp) { - nr_large_failed++; + nr_failed++; stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios)) { stats->nr_thp_split++; @@ -1688,38 +1681,33 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, * When memory is low, don't bother to try to migrate * other folios, move unmapped folios, then exit. */ - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - /* Large folio NUMA faulting doesn't split to retry. */ - if (!nosplit) { - int ret = try_split_folio(folio, split_folios); - - if (!ret) { - stats->nr_thp_split += is_thp; - break; - } else if (reason == MR_LONGTERM_PIN && - ret == -EAGAIN) { - /* - * Try again to split large folio to - * mitigate the failure of longterm pinning. - */ - large_retry++; - thp_retry += is_thp; - nr_retry_pages += nr_pages; - /* Undo duplicated failure counting. */ - nr_large_failed--; - stats->nr_thp_failed -= is_thp; - break; - } + nr_failed++; + stats->nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ + if (folio_test_large(folio) && !nosplit) { + int ret = try_split_folio(folio, split_folios); + + if (!ret) { + stats->nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* + * Try again to split large folio to + * mitigate the failure of longterm pinning. + */ + retry++; + thp_retry += is_thp; + nr_retry_pages += nr_pages; + /* Undo duplicated failure counting. 
*/ + nr_failed--; + stats->nr_thp_failed -= is_thp; + break; } - } else { - nr_failed++; } stats->nr_failed_pages += nr_pages + nr_retry_pages; /* nr_failed isn't updated for not used */ - nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; rc_saved = rc; if (list_empty(&unmap_folios)) @@ -1727,12 +1715,8 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, else goto move; case -EAGAIN: - if (is_large) { - large_retry++; - thp_retry += is_thp; - } else { - retry++; - } + retry++; + thp_retry += is_thp; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: @@ -1750,20 +1734,14 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, * removed from migration folio list and not * retried in the next outer loop. */ - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - } else { - nr_failed++; - } - + nr_failed++; + stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } } } nr_failed += retry; - nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; move: @@ -1771,17 +1749,15 @@ move: try_to_unmap_flush(); retry = 1; - for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { + for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; - large_retry = 0; thp_retry = 0; nr_retry_pages = 0; dst = list_first_entry(&dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); + is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); @@ -1797,12 +1773,8 @@ move: */ switch(rc) { case -EAGAIN: - if (is_large) { - large_retry++; - thp_retry += is_thp; - } else { - retry++; - } + retry++; + thp_retry += is_thp; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: @@ -1810,13 +1782,8 @@ move: stats->nr_thp_succeeded += is_thp; break; default: - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - } else { - nr_failed++; - } - + nr_failed++; + stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } @@ -1825,14 +1792,10 @@ move: } } nr_failed += retry; - nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; - if (rc_saved) - rc = rc_saved; - else - rc = nr_failed + nr_large_failed; + rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ dst = list_first_entry(&dst_folios, struct folio, lru); diff --git a/mm/mm_init.c b/mm/mm_init.c index 7f7f9c677854..10bf560302c4 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core, return 0; } +bool mirrored_kernelcore __initdata_memblock; + /* * kernelcore=size sets the amount of memory for use for allocations that * cannot be reclaimed or migrated. 
@@ -2328,6 +2330,28 @@ void __init init_cma_reserved_pageblock(struct page *page) } #endif +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = pageblock_end_pfn(block_start_pfn); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + cond_resched(); + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + void __init page_alloc_init_late(void) { struct zone *zone; @@ -2368,6 +2392,8 @@ void __init page_alloc_init_late(void) /* Initialize page ext after all struct pages are initialized. */ if (deferred_struct_pages) page_ext_init(); + + page_alloc_sysctl_init(); } #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES @@ -2541,6 +2567,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, __free_pages_core(page, order); } +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); +EXPORT_SYMBOL(init_on_alloc); + +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); +EXPORT_SYMBOL(init_on_free); + static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) diff --git a/mm/mmap.c b/mm/mmap.c index 13678edaa22c..877696464c09 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -299,62 +299,44 @@ out: return origbrk; } -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) -extern void mt_validate(struct maple_tree *mt); -extern void mt_dump(const struct maple_tree *mt); - -/* Validate the maple tree */ -static void validate_mm_mt(struct mm_struct *mm) -{ - struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt; - - MA_STATE(mas, mt, 0, 0); - - mt_validate(&mm->mm_mt); - mas_for_each(&mas, vma_mt, ULONG_MAX) { - if ((vma_mt->vm_start != mas.index) || - (vma_mt->vm_end - 1 != mas.last)) { - pr_emerg("issue in %s\n", current->comm); - dump_stack(); - dump_vma(vma_mt); - pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, - mas.index, mas.last); - pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, - vma_mt->vm_start, vma_mt->vm_end); - - mt_dump(mas.tree); - if (vma_mt->vm_end != mas.last + 1) { - pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", - mm, vma_mt->vm_start, vma_mt->vm_end, - mas.index, mas.last); - mt_dump(mas.tree); - } - VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); - if (vma_mt->vm_start != mas.index) { - pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", - mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree); - } - VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); - } - } -} - +#if defined(CONFIG_DEBUG_VM) static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); - validate_mm_mt(mm); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mt_validate(&mm->mm_mt); +#endif - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; +#endif + unsigned long vmi_start, vmi_end; + bool warn = 0; + + vmi_start = vma_iter_addr(&vmi); + vmi_end = vma_iter_end(&vmi); + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) + warn = 1; + + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) + warn = 1; + + if (warn) { + pr_emerg("issue in %s\n", 
current->comm); + dump_stack(); + dump_vma(vma); + pr_emerg("tree range: %px start %lx end %lx\n", vma, + vmi_start, vmi_end - 1); + vma_iter_dump_tree(&vmi); + } +#ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) @@ -365,16 +347,15 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } -#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ -#define validate_mm_mt(root) do { } while (0) +#else /* !CONFIG_DEBUG_VM */ #define validate_mm(mm) do { } while (0) -#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +#endif /* CONFIG_DEBUG_VM */ /* * vma has some anon_vma assigned, and is already inserted on that @@ -1475,6 +1456,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) +{ + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); +} + +static bool vma_is_shared_writable(struct vm_area_struct *vma) +{ + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == + (VM_WRITE | VM_SHARED); +} + +static bool vma_fs_can_writeback(struct vm_area_struct *vma) +{ + /* No managed pages to writeback. */ + if (vma->vm_flags & VM_PFNMAP) + return false; + + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * Does this VMA require the underlying folios to have their dirty state + * tracked? + */ +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) +{ + /* Only shared, writable VMAs require dirty tracking. */ + if (!vma_is_shared_writable(vma)) + return false; + + /* Does the filesystem need to be notified? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* + * Even if the filesystem doesn't indicate a need for writenotify, if it + * can writeback, dirty tracking is still required. + */ + return vma_fs_can_writeback(vma); +} + /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1483,21 +1506,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { - vm_flags_t vm_flags = vma->vm_flags; - const struct vm_operations_struct *vm_ops = vma->vm_ops; - /* If it was private or non-writable, the write bit is already clear */ - if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + if (!vma_is_shared_writable(vma)) return 0; /* The backer wishes to know when pages are first written to? */ - if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) + if (vm_ops_needs_writenotify(vma->vm_ops)) return 1; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != - pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return 0; /* @@ -1511,13 +1531,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (userfaultfd_wp(vma)) return 1; - /* Specialty mapping? */ - if (vm_flags & VM_PFNMAP) - return 0; - /* Can the mapping track the dirty pages? 
*/ - return vma->vm_file && vma->vm_file->f_mapping && - mapping_can_writeback(vma->vm_file->f_mapping); + return vma_fs_can_writeback(vma); } /* @@ -2234,7 +2249,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); @@ -2292,7 +2307,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Success. */ if (new_below) vma_next(vmi); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return 0; out_free_mpol: @@ -2301,7 +2316,7 @@ out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return err; } @@ -2410,7 +2425,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #endif } - next = vma_next(vmi); + if (vma_iter_end(vmi) > end) + next = vma_iter_load(vmi); + + if (!next) + next = vma_next(vmi); + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2623,6 +2643,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } cannot_expand: + if (prev) + vma_iter_next_range(&vmi); + /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but @@ -2936,7 +2959,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, arch_unmap(mm, start, end); ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); - validate_mm_mt(mm); + validate_mm(mm); return ret; } @@ -2958,7 +2981,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; struct vma_prepare vp; - validate_mm_mt(mm); + validate_mm(mm); /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. @@ -3199,7 +3222,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); - validate_mm_mt(mm); + validate_mm(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3258,7 +3281,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, goto out_vma_link; *need_rmap_locks = false; } - validate_mm_mt(mm); + validate_mm(mm); return new_vma; out_vma_link: @@ -3274,7 +3297,7 @@ out_free_mempol: out_free_vma: vm_area_free(new_vma); out: - validate_mm_mt(mm); + validate_mm(mm); return NULL; } @@ -3411,7 +3434,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - validate_mm_mt(mm); + validate_mm(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3434,12 +3457,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); - validate_mm_mt(mm); + validate_mm(mm); return vma; out: vm_area_free(vma); - validate_mm_mt(mm); + validate_mm(mm); return ERR_PTR(ret); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 044e1eed720e..612b5597d3af 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1130,12 +1130,10 @@ bool out_of_memory(struct oom_control *oc) /* * The OOM killer does not compensate for IO-less reclaim. - * pagefault_out_of_memory lost its gfp context so we have to - * make sure exclude 0 mask - all other users should have at least - * ___GFP_DIRECT_RECLAIM to get here. 
But mem_cgroup_oom() has to - * invoke the OOM killer even if it is a GFP_NOFS allocation. + * But mem_cgroup_oom() has to invoke the OOM killer even + * if it is a GFP_NOFS allocation. */ - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) + if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc12b..1023f41de2fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,12 +18,8 @@ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/swapops.h> #include <linux/interrupt.h> -#include <linux/pagemap.h> #include <linux/jiffies.h> -#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kasan.h> @@ -31,8 +27,6 @@ #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> -#include <linux/blkdev.h> -#include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/oom.h> #include <linux/topology.h> @@ -41,19 +35,10 @@ #include <linux/cpuset.h> #include <linux/memory_hotplug.h> #include <linux/nodemask.h> -#include <linux/vmalloc.h> #include <linux/vmstat.h> -#include <linux/mempolicy.h> -#include <linux/memremap.h> -#include <linux/stop_machine.h> -#include <linux/random.h> #include <linux/sort.h> #include <linux/pfn.h> -#include <linux/backing-dev.h> #include <linux/fault-inject.h> -#include <linux/page-isolation.h> -#include <linux/debugobjects.h> -#include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> #include <trace/events/oom.h> @@ -61,12 +46,9 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> -#include <linux/hugetlb.h> -#include <linux/sched/rt.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/page_table_check.h> -#include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/lockdep.h> @@ -74,13 +56,10 @@ #include <linux/psi.h> #include <linux/khugepaged.h> #include <linux/delayacct.h> -#include <asm/sections.h> -#include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" #include "shuffle.h" #include "page_reporting.h" -#include "swap.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ typedef int __bitwise fpi_t; @@ -227,18 +206,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -atomic_long_t _totalram_pages __read_mostly; -EXPORT_SYMBOL(_totalram_pages); -unsigned long totalreserve_pages __read_mostly; -unsigned long totalcma_pages __read_mostly; - -int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); -EXPORT_SYMBOL(init_on_alloc); - -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); -EXPORT_SYMBOL(init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is @@ -258,44 +226,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) page->index = migratetype; } -#ifdef CONFIG_PM_SLEEP -/* - * The following functions are used by the suspend/hibernate code to temporarily - * change gfp_allowed_mask in order to avoid using I/O during memory allocations - * while devices are suspended. 
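The simplified out_of_memory() check assumes every non-memcg caller arrives with a real GFP context, so only the __GFP_FS bit needs testing; the open-coded __GFP_IO | __GFP_FS test used elsewhere is likewise being wrapped (see the mm/vmscan.c hunk below, which switches to gfp_has_io_fs()). A self-contained sketch of that helper, with placeholder bit values standing in for the real ___GFP_* definitions:

#include <stdbool.h>

typedef unsigned int gfp_t;
#define __GFP_IO ((gfp_t)0x40u)	/* placeholder value */
#define __GFP_FS ((gfp_t)0x80u)	/* placeholder value */

/* True only when the allocation may both start IO and recurse into the fs. */
static inline bool gfp_has_io_fs(gfp_t gfp)
{
	return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS);
}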
To avoid races with the suspend/hibernate code, - * they should always be called with system_transition_mutex held - * (gfp_allowed_mask also should only be modified with system_transition_mutex - * held, unless the suspend/hibernate code is guaranteed not to run in parallel - * with that modification). - */ - -static gfp_t saved_gfp_mask; - -void pm_restore_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (saved_gfp_mask) { - gfp_allowed_mask = saved_gfp_mask; - saved_gfp_mask = 0; - } -} - -void pm_restrict_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - WARN_ON(saved_gfp_mask); - saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); -} - -bool pm_suspended_storage(void) -{ - if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) - return false; - return true; -} -#endif /* CONFIG_PM_SLEEP */ - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -371,10 +301,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; -int watermark_scale_factor = 10; - -bool mirrored_kernelcore __initdata_memblock; +static int watermark_boost_factor __read_mostly = 15000; +static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -550,13 +478,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) return ret; } -static int page_is_consistent(struct zone *zone, struct page *page) -{ - if (zone != page_zone(page)) - return 0; - - return 1; -} /* * Temporary debugging check for pages not lying within a given zone. 
*/ @@ -564,7 +485,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; - if (!page_is_consistent(zone, page)) + if (zone != page_zone(page)) return 1; return 0; @@ -704,75 +625,6 @@ void destroy_large_folio(struct folio *folio) compound_page_dtors[dtor](&folio->page); } -#ifdef CONFIG_DEBUG_PAGEALLOC -unsigned int _debug_guardpage_minorder; - -bool _debug_pagealloc_enabled_early __read_mostly - = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); -EXPORT_SYMBOL(_debug_pagealloc_enabled_early); -DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -EXPORT_SYMBOL(_debug_pagealloc_enabled); - -DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static int __init early_debug_pagealloc(char *buf) -{ - return kstrtobool(buf, &_debug_pagealloc_enabled_early); -} -early_param("debug_pagealloc", early_debug_pagealloc); - -static int __init debug_guardpage_minorder_setup(char *buf) -{ - unsigned long res; - - if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); - return 0; - } - _debug_guardpage_minorder = res; - pr_info("Setting debug_guardpage_minorder to %lu\n", res); - return 0; -} -early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); - -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return false; - - if (order >= debug_guardpage_minorder()) - return false; - - __SetPageGuard(page); - INIT_LIST_HEAD(&page->buddy_list); - set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); - - return true; -} - -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return; - - __ClearPageGuard(page); - - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); -} -#else -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} -#endif - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -1131,6 +983,11 @@ static inline bool free_page_is_bad(struct page *page) return true; } +static inline bool is_check_pages_enabled(void) +{ + return static_branch_unlikely(&check_pages_enabled); +} + static int free_tail_page_prepare(struct page *head_page, struct page *page) { struct folio *folio = (struct folio *)head_page; @@ -1142,7 +999,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) */ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - if (!static_branch_unlikely(&check_pages_enabled)) { + if (!is_check_pages_enabled()) { ret = 0; goto out; } @@ -1521,7 +1378,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, /* end_pfn is one past the range we are checking */ end_pfn--; - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + if (!pfn_valid(end_pfn)) return NULL; start_page = pfn_to_online_page(start_pfn); @@ -1540,33 +1397,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, return start_page; } -void set_zone_contiguous(struct zone *zone) -{ - unsigned long 
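The new is_check_pages_enabled() helper simply gives the static-branch test a name so later checks read as a capability query rather than a jump-label detail. A minimal sketch of the same pattern, using illustrative names rather than anything from this patch:

#include <linux/jump_label.h>
#include <linux/types.h>

DEFINE_STATIC_KEY_FALSE(my_feature_enabled);

/* Nearly free when disabled: the branch is patched out at runtime. */
static inline bool is_my_feature_enabled(void)
{
	return static_branch_unlikely(&my_feature_enabled);
}

/* Flip the key on from setup code when the feature is requested. */
static void my_feature_enable(void)
{
	static_branch_enable(&my_feature_enabled);
}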
block_start_pfn = zone->zone_start_pfn; - unsigned long block_end_pfn; - - block_end_pfn = pageblock_end_pfn(block_start_pfn); - for (; block_start_pfn < zone_end_pfn(zone); - block_start_pfn = block_end_pfn, - block_end_pfn += pageblock_nr_pages) { - - block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); - - if (!__pageblock_pfn_to_page(block_start_pfn, - block_end_pfn, zone)) - return; - cond_resched(); - } - - /* We confirm that there is no hole */ - zone->contiguous = true; -} - -void clear_zone_contiguous(struct zone *zone) -{ - zone->contiguous = false; -} - /* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression @@ -2275,6 +2105,43 @@ do_steal: } +#ifdef CONFIG_CMA +/* + * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via + * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok + * again without ALLOC_CMA to see if to use CMA first. + */ +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + unsigned long watermark; + bool cma_first = false; + + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */ + if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2); + } else { + /* + * watermark failed means UNMOVABLE & RECLAIMBLE is not enough + * now, we should use cma first to keep them stay around the + * corresponding watermark + */ + cma_first = true; + } + return cma_first; +} +#else +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) +{ + return false; +} +#endif /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. @@ -2288,12 +2155,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, if (IS_ENABLED(CONFIG_CMA)) { /* * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. + * allocating from CMA base on judging zone_watermark_ok again + * to see if the latest check got pass via the help of CMA */ if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { + use_cma_first(zone, order, alloc_flags)) { page = __rmqueue_cma_fallback(zone, order); if (page) return page; @@ -2501,61 +2367,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -#ifdef CONFIG_HIBERNATION - -/* - * Touch the watchdog for every WD_PAGE_COUNT pages. 
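The use_cma_first() heuristic added above re-runs the watermark check without ALLOC_CMA: if the zone only passes with CMA's help, the unmovable and reclaimable blocks are already tight and movable allocations should be steered into CMA first; otherwise the older balance rule applies and CMA is preferred only once it holds more than half of the zone's free memory. A userspace sketch of that decision, with the zone counters replaced by a plain struct:

#include <stdbool.h>

/* Illustrative stand-ins for the zone state consulted above. */
struct zone_stats {
	unsigned long free_pages;       /* NR_FREE_PAGES                       */
	unsigned long free_cma_pages;   /* NR_FREE_CMA_PAGES                   */
	bool watermark_ok_without_cma;  /* zone_watermark_ok() minus ALLOC_CMA */
};

static bool use_cma_first(const struct zone_stats *z)
{
	if (!z->watermark_ok_without_cma)
		/* Non-CMA pageblocks are already tight: spare them, use CMA. */
		return true;
	/* Otherwise balance: prefer CMA once it holds over half the free memory. */
	return z->free_cma_pages > z->free_pages / 2;
}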
- */ -#define WD_PAGE_COUNT (128*1024) - -void mark_free_pages(struct zone *zone) -{ - unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; - unsigned long flags; - unsigned int order, t; - struct page *page; - - if (zone_is_empty(zone)) - return; - - spin_lock_irqsave(&zone->lock, flags); - - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - - if (page_zone(page) != zone) - continue; - - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } - - for_each_migratetype_order(order, t) { - list_for_each_entry(page, - &zone->free_area[order].free_list[t], buddy_list) { - unsigned long i; - - pfn = page_to_pfn(page); - for (i = 0; i < (1UL << order); i++) { - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - swsusp_set_page_free(pfn_to_page(pfn + i)); - } - } - } - spin_unlock_irqrestore(&zone->lock, flags); -} -#endif /* CONFIG_PM */ - static bool free_unref_page_prepare(struct page *page, unsigned long pfn, unsigned int order) { @@ -3052,7 +2863,8 @@ struct page *rmqueue(struct zone *preferred_zone, out: /* Separate test+clear to avoid unnecessary atomics */ - if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { + if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) + && (alloc_flags & ALLOC_KSWAPD)) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } @@ -3061,80 +2873,6 @@ out: return page; } -#ifdef CONFIG_FAIL_PAGE_ALLOC - -static struct { - struct fault_attr attr; - - bool ignore_gfp_highmem; - bool ignore_gfp_reclaim; - u32 min_order; -} fail_page_alloc = { - .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_reclaim = true, - .ignore_gfp_highmem = true, - .min_order = 1, -}; - -static int __init setup_fail_page_alloc(char *str) -{ - return setup_fault_attr(&fail_page_alloc.attr, str); -} -__setup("fail_page_alloc=", setup_fail_page_alloc); - -static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - int flags = 0; - - if (order < fail_page_alloc.min_order) - return false; - if (gfp_mask & __GFP_NOFAIL) - return false; - if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return false; - if (fail_page_alloc.ignore_gfp_reclaim && - (gfp_mask & __GFP_DIRECT_RECLAIM)) - return false; - - /* See comment in __should_failslab() */ - if (gfp_mask & __GFP_NOWARN) - flags |= FAULT_NOWARN; - - return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); -} - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - -static int __init fail_page_alloc_debugfs(void) -{ - umode_t mode = S_IFREG | 0600; - struct dentry *dir; - - dir = fault_create_debugfs_attr("fail_page_alloc", NULL, - &fail_page_alloc.attr); - - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim); - debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem); - debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); - - return 0; -} - -late_initcall(fail_page_alloc_debugfs); - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - -#else /* CONFIG_FAIL_PAGE_ALLOC */ - -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - return false; -} - -#endif /* CONFIG_FAIL_PAGE_ALLOC */ - noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); @@ 
-5137,383 +4875,6 @@ unsigned long nr_free_buffer_pages(void) } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -static inline void show_node(struct zone *zone) -{ - if (IS_ENABLED(CONFIG_NUMA)) - printk("Node %d ", zone_to_nid(zone)); -} - -long si_mem_available(void) -{ - long available; - unsigned long pagecache; - unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; - unsigned long reclaimable; - struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); - - for_each_zone(zone) - wmark_low += low_wmark_pages(zone); - - /* - * Estimate the amount of memory available for userspace allocations, - * without causing swapping or OOM. - */ - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; - - /* - * Not all the page cache can be freed, otherwise the system will - * start swapping or thrashing. Assume at least half of the page - * cache, or the low watermark worth of cache, needs to stay. - */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; - pagecache -= min(pagecache / 2, wmark_low); - available += pagecache; - - /* - * Part of the reclaimable slab and other kernel memory consists of - * items that are in use, and cannot be freed. Cap this estimate at the - * low watermark. - */ - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - available += reclaimable - min(reclaimable / 2, wmark_low); - - if (available < 0) - available = 0; - return available; -} -EXPORT_SYMBOL_GPL(si_mem_available); - -void si_meminfo(struct sysinfo *val) -{ - val->totalram = totalram_pages(); - val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_zone_page_state(NR_FREE_PAGES); - val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages(); - val->freehigh = nr_free_highpages(); - val->mem_unit = PAGE_SIZE; -} - -EXPORT_SYMBOL(si_meminfo); - -#ifdef CONFIG_NUMA -void si_meminfo_node(struct sysinfo *val, int nid) -{ - int zone_type; /* needs to be signed */ - unsigned long managed_pages = 0; - unsigned long managed_highpages = 0; - unsigned long free_highpages = 0; - pg_data_t *pgdat = NODE_DATA(nid); - - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - struct zone *zone = &pgdat->node_zones[zone_type]; - - if (is_highmem(zone)) { - managed_highpages += zone_managed_pages(zone); - free_highpages += zone_page_state(zone, NR_FREE_PAGES); - } - } - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif - val->mem_unit = PAGE_SIZE; -} -#endif - -/* - * Determine whether the node should be displayed or not, depending on whether - * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). - */ -static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) -{ - if (!(flags & SHOW_MEM_FILTER_NODES)) - return false; - - /* - * no node mask - aka implicit memory numa policy. Do not bother with - * the synchronization - read_mems_allowed_begin - because we do not - * have to be precise here. 
- */ - if (!nodemask) - nodemask = &cpuset_current_mems_allowed; - - return !node_isset(nid, *nodemask); -} - -static void show_migration_types(unsigned char type) -{ - static const char types[MIGRATE_TYPES] = { - [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RECLAIMABLE] = 'E', - [MIGRATE_HIGHATOMIC] = 'H', -#ifdef CONFIG_CMA - [MIGRATE_CMA] = 'C', -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = 'I', -#endif - }; - char tmp[MIGRATE_TYPES + 1]; - char *p = tmp; - int i; - - for (i = 0; i < MIGRATE_TYPES; i++) { - if (type & (1 << i)) - *p++ = types[i]; - } - - *p = '\0'; - printk(KERN_CONT "(%s) ", tmp); -} - -static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) -{ - int zone_idx; - for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) - if (zone_managed_pages(pgdat->node_zones + zone_idx)) - return true; - return false; -} - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. - * - * Bits in @filter: - * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's - * cpuset. - */ -void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) -{ - unsigned long free_pcp = 0; - int cpu, nid; - struct zone *zone; - pg_data_t *pgdat; - - for_each_populated_zone(zone) { - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - - for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; - } - - printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" - " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu\n" - " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu\n" - " sec_pagetables:%lu bounce:%lu\n" - " kernel_misc_reclaimable:%lu\n" - " free:%lu free_pcp:%lu free_cma:%lu\n", - global_node_page_state(NR_ACTIVE_ANON), - global_node_page_state(NR_INACTIVE_ANON), - global_node_page_state(NR_ISOLATED_ANON), - global_node_page_state(NR_ACTIVE_FILE), - global_node_page_state(NR_INACTIVE_FILE), - global_node_page_state(NR_ISOLATED_FILE), - global_node_page_state(NR_UNEVICTABLE), - global_node_page_state(NR_FILE_DIRTY), - global_node_page_state(NR_WRITEBACK), - global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), - global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), - global_node_page_state(NR_FILE_MAPPED), - global_node_page_state(NR_SHMEM), - global_node_page_state(NR_PAGETABLE), - global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), - global_zone_page_state(NR_FREE_PAGES), - free_pcp, - global_zone_page_state(NR_FREE_CMA_PAGES)); - - for_each_online_pgdat(pgdat) { - if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) - continue; - if (!node_has_managed_zones(pgdat, max_zone_idx)) - continue; - - printk("Node %d" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" - " mapped:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " shmem:%lukB" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" -#endif - " writeback_tmp:%lukB" - " kernel_stack:%lukB" -#ifdef CONFIG_SHADOW_CALL_STACK - " 
shadow_call_stack:%lukB" -#endif - " pagetables:%lukB" - " sec_pagetables:%lukB" - " all_unreclaimable? %s" - "\n", - pgdat->node_id, - K(node_page_state(pgdat, NR_ACTIVE_ANON)), - K(node_page_state(pgdat, NR_INACTIVE_ANON)), - K(node_page_state(pgdat, NR_ACTIVE_FILE)), - K(node_page_state(pgdat, NR_INACTIVE_FILE)), - K(node_page_state(pgdat, NR_UNEVICTABLE)), - K(node_page_state(pgdat, NR_ISOLATED_ANON)), - K(node_page_state(pgdat, NR_ISOLATED_FILE)), - K(node_page_state(pgdat, NR_FILE_MAPPED)), - K(node_page_state(pgdat, NR_FILE_DIRTY)), - K(node_page_state(pgdat, NR_WRITEBACK)), - K(node_page_state(pgdat, NR_SHMEM)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(node_page_state(pgdat, NR_SHMEM_THPS)), - K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), - K(node_page_state(pgdat, NR_ANON_THPS)), -#endif - K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - node_page_state(pgdat, NR_KERNEL_STACK_KB), -#ifdef CONFIG_SHADOW_CALL_STACK - node_page_state(pgdat, NR_KERNEL_SCS_KB), -#endif - K(node_page_state(pgdat, NR_PAGETABLE)), - K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? - "yes" : "no"); - } - - for_each_populated_zone(zone) { - int i; - - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - - free_pcp = 0; - for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; - - show_node(zone); - printk(KERN_CONT - "%s" - " free:%lukB" - " boost:%lukB" - " min:%lukB" - " low:%lukB" - " high:%lukB" - " reserved_highatomic:%luKB" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " writepending:%lukB" - " present:%lukB" - " managed:%lukB" - " mlocked:%lukB" - " bounce:%lukB" - " free_pcp:%lukB" - " local_pcp:%ukB" - " free_cma:%lukB" - "\n", - zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->watermark_boost), - K(min_wmark_pages(zone)), - K(low_wmark_pages(zone)), - K(high_wmark_pages(zone)), - K(zone->nr_reserved_highatomic), - K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), - K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), - K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), - K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), - K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), - K(zone->present_pages), - K(zone_managed_pages(zone)), - K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), - K(free_pcp), - K(this_cpu_read(zone->per_cpu_pageset->count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES))); - printk("lowmem_reserve[]:"); - for (i = 0; i < MAX_NR_ZONES; i++) - printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); - printk(KERN_CONT "\n"); - } - - for_each_populated_zone(zone) { - unsigned int order; - unsigned long nr[MAX_ORDER + 1], flags, total = 0; - unsigned char types[MAX_ORDER + 1]; - - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - show_node(zone); - printk(KERN_CONT "%s: ", zone->name); - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[order]; - int type; - - nr[order] = area->nr_free; - total += nr[order] << order; - - types[order] = 0; - for (type = 0; type < MIGRATE_TYPES; type++) { - if (!free_area_empty(area, type)) - types[order] |= 1 << type; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order <= 
MAX_ORDER; order++) { - printk(KERN_CONT "%lu*%lukB ", - nr[order], K(1UL) << order); - if (nr[order]) - show_migration_types(types[order]); - } - printk(KERN_CONT "= %lukB\n", K(total)); - } - - for_each_online_node(nid) { - if (show_mem_node_skip(filter, nid, nodemask)) - continue; - hugetlb_show_meminfo_node(nid); - } - - printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); - - show_swap_cache_info(); -} - static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; @@ -5560,12 +4921,12 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -char numa_zonelist_order[] = "Node"; - +static char numa_zonelist_order[] = "Node"; +#define NUMA_ZONELIST_ORDER_LEN 16 /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(struct ctl_table *table, int write, +static int numa_zonelist_order_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { if (write) @@ -5573,7 +4934,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, return proc_dostring(table, write, buffer, length, ppos); } - static int node_load[MAX_NUMNODES]; /** @@ -5976,6 +5336,7 @@ static int zone_batchsize(struct zone *zone) #endif } +static int percpu_pagelist_high_fraction; static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU @@ -6505,7 +5866,7 @@ postcore_initcall(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, +static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6521,7 +5882,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, +static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6551,7 +5912,7 @@ static void setup_min_unmapped_ratio(void) } -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6578,7 +5939,7 @@ static void setup_min_slab_ratio(void) sysctl_min_slab_ratio) / 100; } -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6602,8 +5963,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) +static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) { int i; @@ -6623,7 +5984,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. 
*/ -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, +static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -6656,9 +6017,83 @@ out: return ret; } +static struct ctl_table page_alloc_sysctl_table[] = { + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_THREE_THOUSAND, + }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif + {} +}; + +void __init page_alloc_sysctl_init(void) +{ + register_sysctl_init("vm", page_alloc_sysctl_table); +} + #ifdef CONFIG_CONTIG_ALLOC -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) { @@ -6672,11 +6107,6 @@ static void alloc_contig_dump_pages(struct list_head *page_list) dump_page(page, "migration failure"); } } -#else -static inline void alloc_contig_dump_pages(struct list_head *page_list) -{ -} -#endif /* [start, end) must belong to a single zone. 
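Making the handlers static works because page_alloc.c now registers its own "vm" ctl_table through register_sysctl_init() instead of exporting the handlers to kernel/sysctl.c. A trimmed sketch of the same registration pattern for a hypothetical knob:

#include <linux/init.h>
#include <linux/sysctl.h>

static int my_tunable = 10;

/* Hypothetical knob: /proc/sys/vm/my_tunable, clamped to >= 0. */
static struct ctl_table my_sysctl_table[] = {
	{
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(my_tunable),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{}
};

static int __init my_sysctl_init(void)
{
	register_sysctl_init("vm", my_sysctl_table);
	return 0;
}
postcore_initcall(my_sysctl_init);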
*/ int __alloc_contig_migrate_range(struct compact_control *cc, diff --git a/mm/page_owner.c b/mm/page_owner.c index 31169b3e7f06..28c519fc9372 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -139,6 +139,7 @@ void __reset_page_owner(struct page *page, unsigned short order) int i; struct page_ext *page_ext; depot_stack_handle_t handle; + depot_stack_handle_t alloc_handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); @@ -146,6 +147,9 @@ void __reset_page_owner(struct page *page, unsigned short order) if (unlikely(!page_ext)) return; + page_owner = get_page_owner(page_ext); + alloc_handle = page_owner->handle; + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -155,6 +159,7 @@ void __reset_page_owner(struct page *page, unsigned short order) page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); + stack_depot_dec_count(alloc_handle); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -196,6 +201,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, return; __set_page_owner_handle(page_ext, handle, order, gfp_mask); page_ext_put(page_ext); + stack_depot_inc_count(handle); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -713,6 +719,47 @@ static const struct file_operations proc_page_owner_operations = { .llseek = lseek_page_owner, }; +static void stack_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations page_owner_stack_op = { + .start = stack_start, + .next = stack_next, + .stop = stack_stop, + .show = stack_print +}; + +static int page_owner_stack_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &page_owner_stack_op, + sizeof(unsigned long)); +} + +const struct file_operations page_owner_stack_operations = { + .open = page_owner_stack_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +unsigned long page_owner_stack_threshold; + +int page_owner_threshold_get(void *data, u64 *val) +{ + *val = page_owner_stack_threshold; + return 0; +} + +int page_owner_threshold_set(void *data, u64 val) +{ + page_owner_stack_threshold = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, + &page_owner_threshold_set, "%llu"); + static int __init pageowner_init(void) { if (!static_branch_unlikely(&page_owner_inited)) { @@ -723,6 +770,13 @@ static int __init pageowner_init(void) debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); + debugfs_create_file("page_owner_stacks", 0400, NULL, NULL, + &page_owner_stack_operations); + debugfs_create_file("page_owner_threshold", 0600, NULL, NULL, + &proc_page_owner_threshold); + + page_owner_stack_threshold = 0; + return 0; } late_initcall(pageowner_init) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 78dfaf9e8990..0523edab03a6 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr, mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, - NULL, &locked); + &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) diff --git a/mm/rmap.c b/mm/rmap.c index b42fc0389c24..ae127f60a4fb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2328,7 +2328,7 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, npages = get_user_pages_remote(mm, start, 
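The page_owner hunks above keep a live allocation count per stack via stack_depot_inc_count()/stack_depot_dec_count() and expose the result in debugfs, with page_owner_threshold filtering which stacks get printed. A minimal sketch of the get/set plus DEFINE_SIMPLE_ATTRIBUTE idiom used for that threshold file, with hypothetical names:

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>

static u64 my_threshold;

static int my_threshold_get(void *data, u64 *val)
{
	*val = my_threshold;
	return 0;
}

static int my_threshold_set(void *data, u64 val)
{
	my_threshold = val;
	return 0;
}
/* One read/write u64 attribute, printed and parsed with "%llu". */
DEFINE_SIMPLE_ATTRIBUTE(my_threshold_fops, my_threshold_get, my_threshold_set, "%llu");

static int __init my_debugfs_init(void)
{
	debugfs_create_file("my_threshold", 0600, NULL, NULL, &my_threshold_fops);
	return 0;
}
late_initcall(my_debugfs_init);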
npages, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, - pages, NULL, NULL); + pages, NULL); if (npages < 0) return npages; diff --git a/mm/secretmem.c b/mm/secretmem.c index 0b502625cd30..974b32ba8b9d 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -35,7 +35,7 @@ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK -static bool secretmem_enable __ro_after_init; +static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); diff --git a/mm/show_mem.c b/mm/show_mem.c new file mode 100644 index 000000000000..01f8e9905817 --- /dev/null +++ b/mm/show_mem.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generic show_mem() implementation + * + * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de> + */ + +#include <linux/blkdev.h> +#include <linux/cma.h> +#include <linux/cpuset.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/swap.h> +#include <linux/vmstat.h> + +#include "internal.h" +#include "swap.h" + +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); +unsigned long totalreserve_pages __read_mostly; +unsigned long totalcma_pages __read_mostly; + +static inline void show_node(struct zone *zone) +{ + if (IS_ENABLED(CONFIG_NUMA)) + printk("Node %d ", zone_to_nid(zone)); +} + +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += low_wmark_pages(zone); + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping or OOM. + */ + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. 
+ */ + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages(); + val->sharedram = global_node_page_state(NR_SHMEM); + val->freeram = global_zone_page_state(NR_FREE_PAGES); + val->bufferram = nr_blockdev_pages(); + val->totalhigh = totalhigh_pages(); + val->freehigh = nr_free_highpages(); + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; + pg_data_t *pgdat = NODE_DATA(nid); + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); +#ifdef CONFIG_HIGHMEM + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone_managed_pages(zone); + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; +#else + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; +#endif + val->mem_unit = PAGE_SIZE; +} +#endif + +/* + * Determine whether the node should be displayed or not, depending on whether + * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). + */ +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) +{ + if (!(flags & SHOW_MEM_FILTER_NODES)) + return false; + + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); +} + +static void show_migration_types(unsigned char type) +{ + static const char types[MIGRATE_TYPES] = { + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', +#ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', +#endif +#ifdef CONFIG_MEMORY_ISOLATION + [MIGRATE_ISOLATE] = 'I', +#endif + }; + char tmp[MIGRATE_TYPES + 1]; + char *p = tmp; + int i; + + for (i = 0; i < MIGRATE_TYPES; i++) { + if (type & (1 << i)) + *p++ = types[i]; + } + + *p = '\0'; + printk(KERN_CONT "(%s) ", tmp); +} + +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. 
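si_mem_available(), now housed in mm/show_mem.c, estimates how much memory userspace could still allocate without swapping: free pages minus reserves, plus the file LRU and reclaimable kernel memory, each discounted by half or by the low watermark, whichever is smaller. A worked sketch of that arithmetic with invented numbers (all values in pages):

#include <stdio.h>

static long min_l(long a, long b) { return a < b ? a : b; }

int main(void)
{
	long free_pages    = 200000, total_reserve = 20000;
	long active_file   = 150000, inactive_file = 100000;
	long reclaimable   = 50000,  wmark_low     = 30000;

	long pagecache = active_file + inactive_file;
	long available = free_pages - total_reserve;

	/* At least half the page cache, or the low watermark, stays resident. */
	available += pagecache - min_l(pagecache / 2, wmark_low);
	/* Same haircut for reclaimable slab and misc kernel memory. */
	available += reclaimable - min_l(reclaimable / 2, wmark_low);

	if (available < 0)
		available = 0;
	printf("estimated available: %ld pages\n", available);	/* 425000 */
	return 0;
}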
+ */ +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +{ + unsigned long free_pcp = 0; + int cpu, nid; + struct zone *zone; + pg_data_t *pgdat; + + for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; + } + + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" + " free:%lu free_pcp:%lu free_cma:%lu\n", + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), + global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), + global_zone_page_state(NR_FREE_PAGES), + free_pcp, + global_zone_page_state(NR_FREE_CMA_PAGES)); + + for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; + + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif + " pagetables:%lukB" + " sec_pagetables:%lukB" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), + K(node_page_state(pgdat, NR_SHMEM)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS)), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), + K(node_page_state(pgdat, NR_ANON_THPS)), +#endif + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + node_page_state(pgdat, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + node_page_state(pgdat, NR_KERNEL_SCS_KB), +#endif + K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
+ "yes" : "no"); + } + + for_each_populated_zone(zone) { + int i; + + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; + + show_node(zone); + printk(KERN_CONT + "%s" + " free:%lukB" + " boost:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " reserved_highatomic:%luKB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " writepending:%lukB" + " present:%lukB" + " managed:%lukB" + " mlocked:%lukB" + " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" + " free_cma:%lukB" + "\n", + zone->name, + K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->watermark_boost), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), + K(zone->nr_reserved_highatomic), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), + K(zone->present_pages), + K(zone_managed_pages(zone)), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->per_cpu_pageset->count)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); + } + + for_each_populated_zone(zone) { + unsigned int order; + unsigned long nr[MAX_ORDER + 1], flags, total = 0; + unsigned char types[MAX_ORDER + 1]; + + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + show_node(zone); + printk(KERN_CONT "%s: ", zone->name); + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order <= MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + int type; + + nr[order] = area->nr_free; + total += nr[order] << order; + + types[order] = 0; + for (type = 0; type < MIGRATE_TYPES; type++) { + if (!free_area_empty(area, type)) + types[order] |= 1 << type; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order <= MAX_ORDER; order++) { + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); + if (nr[order]) + show_migration_types(types[order]); + } + printk(KERN_CONT "= %lukB\n", K(total)); + } + + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } + + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); + + show_swap_cache_info(); +} + +void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +{ + unsigned long total = 0, reserved = 0, highmem = 0; + struct zone *zone; + + printk("Mem-Info:\n"); + __show_free_areas(filter, nodemask, max_zone_idx); + + for_each_populated_zone(zone) { + + total += zone->present_pages; + reserved += zone->present_pages - zone_managed_pages(zone); + + if (is_highmem(zone)) + highmem += zone->present_pages; + } + + printk("%lu pages RAM\n", total); + printk("%lu pages HighMem/MovableOnly\n", highmem); + printk("%lu pages reserved\n", reserved); +#ifdef CONFIG_CMA + printk("%lu pages cma reserved\n", totalcma_pages); +#endif +#ifdef CONFIG_MEMORY_FAILURE + printk("%lu pages 
hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); +#endif +} diff --git a/mm/swapfile.c b/mm/swapfile.c index 274bbf797480..c74259001d5e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -41,6 +41,7 @@ #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> +#include <linux/suspend.h> #include <asm/tlbflush.h> #include <linux/swapops.h> diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..15efbfbb1963 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2458,7 +2458,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) + if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; too_many = isolated > inactive; diff --git a/mm/workingset.c b/mm/workingset.c index 817758951886..90ae785d4c9c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -255,6 +255,29 @@ static void *lru_gen_eviction(struct folio *folio) return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); } +/* + * Tests if the shadow entry is for a folio that was recently evicted. + * Fills in @memcgid, @pglist_data, @token, @workingset with the values + * unpacked from shadow. + */ +static bool lru_gen_test_recent(void *shadow, bool file, int *memcgid, + struct pglist_data **pgdat, unsigned long *token, bool *workingset) +{ + struct mem_cgroup *eviction_memcg; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + unsigned long min_seq; + + unpack_shadow(shadow, memcgid, pgdat, token, workingset); + eviction_memcg = mem_cgroup_from_id(*memcgid); + + lruvec = mem_cgroup_lruvec(eviction_memcg, *pgdat); + lrugen = &lruvec->lrugen; + + min_seq = READ_ONCE(lrugen->min_seq[file]); + return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); +} + static void lru_gen_refault(struct folio *folio, void *shadow) { int hist, tier, refs; @@ -269,23 +292,22 @@ static void lru_gen_refault(struct folio *folio, void *shadow) int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); - unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); - - if (pgdat != folio_pgdat(folio)) - return; - rcu_read_lock(); + if (!lru_gen_test_recent(shadow, type, &memcg_id, &pgdat, &token, + &workingset)) + goto unlock; + memcg = folio_memcg_rcu(folio); if (memcg_id != mem_cgroup_id(memcg)) goto unlock; + if (pgdat != folio_pgdat(folio)) + goto unlock; + lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; - min_seq = READ_ONCE(lrugen->min_seq[type]); - if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) - goto unlock; hist = lru_hist_from_seq(min_seq); /* see the comment in folio_lru_refs() */ @@ -317,6 +339,12 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } +static bool lru_gen_test_recent(void *shadow, bool file, int *memcgid, + struct pglist_data **pgdat, unsigned long *token, bool *workingset) +{ + return false; +} + static void lru_gen_refault(struct folio *folio, void *shadow) { } @@ -385,42 +413,34 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) } /** - * workingset_refault - Evaluate the refault of a previously evicted folio. - * @folio: The freshly allocated replacement folio. - * @shadow: Shadow entry of the evicted folio. - * - * Calculates and evaluates the refault distance of the previously - * evicted folio in the context of the node and the memcg whose memory - * pressure caused the eviction. 
+ * workingset_test_recent - tests if the shadow entry is for a folio that was + * recently evicted. Also fills in @workingset with the value unpacked from + * shadow. + * @shadow: the shadow entry to be tested. + * @file: whether the corresponding folio is from the file lru. + * @workingset: where the workingset value unpacked from shadow should + * be stored. + * + * Return: true if the shadow is for a recently evicted folio; false otherwise. */ -void workingset_refault(struct folio *folio, void *shadow) +bool workingset_test_recent(void *shadow, bool file, bool *workingset) { - bool file = folio_is_file_lru(folio); struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; - struct pglist_data *pgdat; - struct mem_cgroup *memcg; - unsigned long eviction; - struct lruvec *lruvec; unsigned long refault; - bool workingset; int memcgid; - long nr; + struct pglist_data *pgdat; + unsigned long eviction; - if (lru_gen_enabled()) { - lru_gen_refault(folio, shadow); - return; - } + if (lru_gen_enabled()) + return lru_gen_test_recent(shadow, file, &memcgid, &pgdat, &eviction, + workingset); - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; - /* Flush stats (and potentially sleep) before holding RCU read lock */ - mem_cgroup_flush_stats_ratelimited(); - - rcu_read_lock(); /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. @@ -439,7 +459,8 @@ void workingset_refault(struct folio *folio, void *shadow) */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) - goto out; + return false; + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -462,20 +483,6 @@ void workingset_refault(struct folio *folio, void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; /* - * The activation decision for this folio is made at the level - * where the eviction occurred, as that is where the LRU order - * during folio reclaim is being determined. - * - * However, the cgroup that will own the folio is the one that - * is actually experiencing the refault event. - */ - nr = folio_nr_pages(folio); - memcg = folio_memcg(folio); - pgdat = folio_pgdat(folio); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - - mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether @@ -495,7 +502,54 @@ void workingset_refault(struct folio *folio, void *shadow) NR_INACTIVE_ANON); } } - if (refault_distance > workingset_size) + + return refault_distance <= workingset_size; +} + +/** + * workingset_refault - Evaluate the refault of a previously evicted folio. + * @folio: The freshly allocated replacement folio. + * @shadow: Shadow entry of the evicted folio. + * + * Calculates and evaluates the refault distance of the previously + * evicted folio in the context of the node and the memcg whose memory + * pressure caused the eviction. 
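workingset_test_recent() keeps the existing heuristic: the refault distance is how far the nonresident-age counter advanced since eviction, computed modulo the eviction counter width so wraparound is harmless, and the entry counts as recent only if that distance fits within the current workingset size. A compact sketch of the distance test with hypothetical sizes:

#include <stdbool.h>
#include <stdio.h>

/* Assume shadow entries store the eviction counter in, say, 40 bits. */
#define EVICTION_BITS 40
#define EVICTION_MASK ((1UL << EVICTION_BITS) - 1)

/*
 * Distance the LRU "clock" moved since this entry was evicted; the
 * subtraction is done modulo the counter width, so wraparound works.
 */
static bool refault_is_recent(unsigned long nonresident_age,
			      unsigned long eviction_token,
			      unsigned long workingset_size)
{
	unsigned long distance = (nonresident_age - eviction_token) & EVICTION_MASK;

	return distance <= workingset_size;
}

int main(void)
{
	/* Evicted at 1000, refaulted when the counter reads 5000: distance 4000. */
	printf("%d\n", refault_is_recent(5000, 1000, 8192));	/* 1: recent, activate */
	printf("%d\n", refault_is_recent(5000, 1000, 2048));	/* 0: too distant      */
	return 0;
}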
+ */ +void workingset_refault(struct folio *folio, void *shadow) +{ + bool file = folio_is_file_lru(folio); + struct pglist_data *pgdat; + struct mem_cgroup *memcg; + struct lruvec *lruvec; + bool workingset; + long nr; + + if (lru_gen_enabled()) { + lru_gen_refault(folio, shadow); + return; + } + + /* Flush stats (and potentially sleep) before holding RCU read lock */ + mem_cgroup_flush_stats_ratelimited(); + + rcu_read_lock(); + + /* + * The activation decision for this folio is made at the level + * where the eviction occurred, as that is where the LRU order + * during folio reclaim is being determined. + * + * However, the cgroup that will own the folio is the one that + * is actually experiencing the refault event. + */ + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); + pgdat = folio_pgdat(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); + + if (!workingset_test_recent(shadow, file, &workingset)) goto out; folio_set_active(folio); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 02f7f414aade..c0d433541636 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1341,7 +1341,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, spin_unlock(&pool->lock); class = zspage_class(pool, zspage); - off = (class->size * obj_idx) & ~PAGE_MASK; + off = offset_in_page(class->size * obj_idx); local_lock(&zs_map_area.lock); area = this_cpu_ptr(&zs_map_area); @@ -1381,7 +1381,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); class = zspage_class(pool, zspage); - off = (class->size * obj_idx) & ~PAGE_MASK; + off = offset_in_page(class->size * obj_idx); area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) @@ -1438,7 +1438,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, offset = obj * class->size; nr_page = offset >> PAGE_SHIFT; - m_offset = offset & ~PAGE_MASK; + m_offset = offset_in_page(offset); m_page = get_first_page(zspage); for (i = 0; i < nr_page; i++) @@ -1548,7 +1548,7 @@ static void obj_free(int class_size, unsigned long obj, unsigned long *handle) void *vaddr; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class_size * f_objidx) & ~PAGE_MASK; + f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); @@ -1640,8 +1640,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); - s_off = (class->size * s_objidx) & ~PAGE_MASK; - d_off = (class->size * d_objidx) & ~PAGE_MASK; + s_off = offset_in_page(class->size * s_objidx); + d_off = offset_in_page(class->size * d_objidx); if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 02207e852d79..06cead2b8e34 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -103,7 +103,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) mmap_read_lock(current->mm); npgs = pin_user_pages(address, umem->npgs, - gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); + gup_flags | FOLL_LONGTERM, &umem->pgs[0]); mmap_read_unlock(current->mm); if (npgs != umem->npgs) { diff --git a/scripts/spelling.txt b/scripts/spelling.txt index f8bd6178d17b..fc7ba95e86a0 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -155,6 +155,7 @@ aquired||acquired aquisition||acquisition arbitary||arbitrary 
architechture||architecture +archtecture||architecture arguement||argument arguements||arguments arithmatic||arithmetic @@ -279,6 +280,7 @@ cant'||can't canot||cannot cann't||can't cannnot||cannot +capabiity||capability capabilites||capabilities capabilties||capabilities capabilty||capability @@ -426,6 +428,7 @@ cotrol||control cound||could couter||counter coutner||counter +creationg||creating cryptocraphic||cryptographic cummulative||cumulative cunter||counter @@ -492,6 +495,7 @@ destorys||destroys destroied||destroyed detabase||database deteced||detected +detecion||detection detectt||detect detroyed||destroyed develope||develop @@ -513,6 +517,7 @@ diferent||different differrence||difference diffrent||different differenciate||differentiate +diffreential||differential diffrentiate||differentiate difinition||definition digial||digital @@ -617,6 +622,7 @@ evalute||evaluate evalutes||evaluates evalution||evaluation excecutable||executable +excceed||exceed exceded||exceeded exceds||exceeds exceeed||exceed @@ -632,6 +638,7 @@ existant||existent exixt||exist exsits||exists exlcude||exclude +exlcuding||excluding exlcusive||exclusive exlusive||exclusive exmaple||example @@ -726,6 +733,8 @@ generiously||generously genereate||generate genereted||generated genric||generic +gerenal||general +geting||getting globel||global grabing||grabbing grahical||graphical @@ -899,6 +908,7 @@ iteraions||iterations iternations||iterations itertation||iteration itslef||itself +ivalid||invalid jave||java jeffies||jiffies jumpimng||jumping @@ -977,6 +987,7 @@ microprocesspr||microprocessor migrateable||migratable millenium||millennium milliseonds||milliseconds +minimim||minimum minium||minimum minimam||minimum minimun||minimum @@ -1042,6 +1053,7 @@ notifed||notified notity||notify nubmer||number numebr||number +numer||number numner||number nunber||number obtaion||obtain @@ -1061,6 +1073,7 @@ offet||offset offlaod||offload offloded||offloaded offseting||offsetting +oflload||offload omited||omitted omiting||omitting omitt||omit @@ -1105,6 +1118,7 @@ pakage||package paket||packet pallette||palette paln||plan +palne||plane paramameters||parameters paramaters||parameters paramater||parameter @@ -1181,12 +1195,14 @@ previsously||previously primative||primitive princliple||principle priorty||priority +priting||printing privilaged||privileged privilage||privilege priviledge||privilege priviledges||privileges privleges||privileges probaly||probably +probabalistic||probabilistic procceed||proceed proccesors||processors procesed||processed @@ -1460,6 +1476,7 @@ submited||submitted submition||submission succeded||succeeded suceed||succeed +succesfuly||successfully succesfully||successfully succesful||successful successed||succeeded @@ -1503,6 +1520,7 @@ symetric||symmetric synax||syntax synchonized||synchronized sychronization||synchronization +sychronously||synchronously synchronuously||synchronously syncronize||synchronize syncronized||synchronized @@ -1532,6 +1550,7 @@ threee||three threshhold||threshold thresold||threshold throught||through +tansition||transition trackling||tracking troughput||throughput trys||tries @@ -1611,6 +1630,7 @@ unneccessary||unnecessary unnecesary||unnecessary unneedingly||unnecessarily unnsupported||unsupported +unuspported||unsupported unmached||unmatched unprecise||imprecise unpriviledged||unprivileged @@ -1657,6 +1677,7 @@ verfication||verification veriosn||version verisons||versions verison||version +veritical||vertical verson||version vicefersa||vice-versa virtal||virtual @@ -1677,6 +1698,7 @@ 
whenver||whenever wheter||whether whe||when wierd||weird +wihout||without wiil||will wirte||write withing||within diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 31af29f669d2..ac20c0bdff9d 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -916,7 +916,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL, NULL); + FOLL_FORCE, &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return false; diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h index 1bb0afc21309..81563c3dfce7 100644 --- a/tools/testing/radix-tree/linux/init.h +++ b/tools/testing/radix-tree/linux/init.h @@ -1 +1,2 @@ #define __init +#define __exit diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 9286d3baa12d..03539d86cdf0 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -14,6 +14,7 @@ #include "test.h" #include <stdlib.h> #include <time.h> +#include "linux/init.h" #define module_init(x) #define module_exit(x) @@ -22,7 +23,6 @@ #define dump_stack() assert(0) #include "../../../lib/maple_tree.c" -#undef CONFIG_DEBUG_MAPLE_TREE #include "../../../lib/test_maple_tree.c" #define RCU_RANGE_COUNT 1000 @@ -81,7 +81,7 @@ static void check_mas_alloc_node_count(struct ma_state *mas) * check_new_node() - Check the creation of new nodes and error path * verification. */ -static noinline void check_new_node(struct maple_tree *mt) +static noinline void __init check_new_node(struct maple_tree *mt) { struct maple_node *mn, *mn2, *mn3; @@ -455,7 +455,7 @@ static noinline void check_new_node(struct maple_tree *mt) /* * Check erasing including RCU. 
*/ -static noinline void check_erase(struct maple_tree *mt, unsigned long index, +static noinline void __init check_erase(struct maple_tree *mt, unsigned long index, void *ptr) { MT_BUG_ON(mt, mtree_test_erase(mt, index) != ptr); @@ -465,24 +465,24 @@ static noinline void check_erase(struct maple_tree *mt, unsigned long index, #define erase_check_insert(mt, i) check_insert(mt, set[i], entry[i%2]) #define erase_check_erase(mt, i) check_erase(mt, set[i], entry[i%2]) -static noinline void check_erase_testset(struct maple_tree *mt) +static noinline void __init check_erase_testset(struct maple_tree *mt) { - unsigned long set[] = { 5015, 5014, 5017, 25, 1000, - 1001, 1002, 1003, 1005, 0, - 6003, 6002, 6008, 6012, 6015, - 7003, 7002, 7008, 7012, 7015, - 8003, 8002, 8008, 8012, 8015, - 9003, 9002, 9008, 9012, 9015, - 10003, 10002, 10008, 10012, 10015, - 11003, 11002, 11008, 11012, 11015, - 12003, 12002, 12008, 12012, 12015, - 13003, 13002, 13008, 13012, 13015, - 14003, 14002, 14008, 14012, 14015, - 15003, 15002, 15008, 15012, 15015, - }; - - - void *ptr = &set; + static const unsigned long set[] = { 5015, 5014, 5017, 25, 1000, + 1001, 1002, 1003, 1005, 0, + 6003, 6002, 6008, 6012, 6015, + 7003, 7002, 7008, 7012, 7015, + 8003, 8002, 8008, 8012, 8015, + 9003, 9002, 9008, 9012, 9015, + 10003, 10002, 10008, 10012, 10015, + 11003, 11002, 11008, 11012, 11015, + 12003, 12002, 12008, 12012, 12015, + 13003, 13002, 13008, 13012, 13015, + 14003, 14002, 14008, 14012, 14015, + 15003, 15002, 15008, 15012, 15015, + }; + + + void *ptr = &check_erase_testset; void *entry[2] = { ptr, mt }; void *root_node; @@ -739,7 +739,7 @@ static noinline void check_erase_testset(struct maple_tree *mt) int mas_ce2_over_count(struct ma_state *mas_start, struct ma_state *mas_end, void *s_entry, unsigned long s_min, void *e_entry, unsigned long e_max, - unsigned long *set, int i, bool null_entry) + const unsigned long *set, int i, bool null_entry) { int count = 0, span = 0; unsigned long retry = 0; @@ -969,8 +969,8 @@ retry: } #if defined(CONFIG_64BIT) -static noinline void check_erase2_testset(struct maple_tree *mt, - unsigned long *set, unsigned long size) +static noinline void __init check_erase2_testset(struct maple_tree *mt, + const unsigned long *set, unsigned long size) { int entry_count = 0; int check = 0; @@ -1054,7 +1054,7 @@ static noinline void check_erase2_testset(struct maple_tree *mt, if (entry_count) MT_BUG_ON(mt, !mt_height(mt)); #if check_erase2_debug > 1 - mt_dump(mt); + mt_dump(mt, mt_dump_hex); #endif #if check_erase2_debug pr_err("Done\n"); @@ -1085,7 +1085,7 @@ static noinline void check_erase2_testset(struct maple_tree *mt, mas_for_each(&mas, foo, ULONG_MAX) { if (xa_is_zero(foo)) { if (addr == mas.index) { - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); pr_err("retry failed %lu - %lu\n", mas.index, mas.last); MT_BUG_ON(mt, 1); @@ -1114,11 +1114,11 @@ static noinline void check_erase2_testset(struct maple_tree *mt, /* These tests were pulled from KVM tree modifications which failed. 
*/ -static noinline void check_erase2_sets(struct maple_tree *mt) +static noinline void __init check_erase2_sets(struct maple_tree *mt) { void *entry; unsigned long start = 0; - unsigned long set[] = { + static const unsigned long set[] = { STORE, 140737488347136, 140737488351231, STORE, 140721266458624, 140737488351231, ERASE, 140721266458624, 140737488351231, @@ -1136,7 +1136,7 @@ ERASE, 140253902692352, 140253902864383, STORE, 140253902692352, 140253902696447, STORE, 140253902696448, 140253902864383, }; - unsigned long set2[] = { + static const unsigned long set2[] = { STORE, 140737488347136, 140737488351231, STORE, 140735933583360, 140737488351231, ERASE, 140735933583360, 140737488351231, @@ -1160,7 +1160,7 @@ STORE, 140277094813696, 140277094821887, STORE, 140277094821888, 140277094825983, STORE, 140735933906944, 140735933911039, }; - unsigned long set3[] = { + static const unsigned long set3[] = { STORE, 140737488347136, 140737488351231, STORE, 140735790264320, 140737488351231, ERASE, 140735790264320, 140737488351231, @@ -1203,7 +1203,7 @@ STORE, 47135835840512, 47135835885567, STORE, 47135835885568, 47135835893759, }; - unsigned long set4[] = { + static const unsigned long set4[] = { STORE, 140737488347136, 140737488351231, STORE, 140728251703296, 140737488351231, ERASE, 140728251703296, 140737488351231, @@ -1224,7 +1224,7 @@ ERASE, 47646523277312, 47646523445247, STORE, 47646523277312, 47646523400191, }; - unsigned long set5[] = { + static const unsigned long set5[] = { STORE, 140737488347136, 140737488351231, STORE, 140726874062848, 140737488351231, ERASE, 140726874062848, 140737488351231, @@ -1357,7 +1357,7 @@ STORE, 47884791619584, 47884791623679, STORE, 47884791623680, 47884791627775, }; - unsigned long set6[] = { + static const unsigned long set6[] = { STORE, 140737488347136, 140737488351231, STORE, 140722999021568, 140737488351231, ERASE, 140722999021568, 140737488351231, @@ -1489,7 +1489,7 @@ ERASE, 47430432014336, 47430432022527, STORE, 47430432014336, 47430432018431, STORE, 47430432018432, 47430432022527, }; - unsigned long set7[] = { + static const unsigned long set7[] = { STORE, 140737488347136, 140737488351231, STORE, 140729808330752, 140737488351231, ERASE, 140729808330752, 140737488351231, @@ -1621,7 +1621,7 @@ ERASE, 47439987130368, 47439987138559, STORE, 47439987130368, 47439987134463, STORE, 47439987134464, 47439987138559, }; - unsigned long set8[] = { + static const unsigned long set8[] = { STORE, 140737488347136, 140737488351231, STORE, 140722482974720, 140737488351231, ERASE, 140722482974720, 140737488351231, @@ -1754,7 +1754,7 @@ STORE, 47708488638464, 47708488642559, STORE, 47708488642560, 47708488646655, }; - unsigned long set9[] = { + static const unsigned long set9[] = { STORE, 140737488347136, 140737488351231, STORE, 140736427839488, 140737488351231, ERASE, 140736427839488, 140736427839488, @@ -5620,7 +5620,7 @@ ERASE, 47906195480576, 47906195480576, STORE, 94641242615808, 94641242750975, }; - unsigned long set10[] = { + static const unsigned long set10[] = { STORE, 140737488347136, 140737488351231, STORE, 140736427839488, 140737488351231, ERASE, 140736427839488, 140736427839488, @@ -9484,7 +9484,7 @@ STORE, 139726599680000, 139726599684095, ERASE, 47906195480576, 47906195480576, STORE, 94641242615808, 94641242750975, }; - unsigned long set11[] = { + static const unsigned long set11[] = { STORE, 140737488347136, 140737488351231, STORE, 140732658499584, 140737488351231, ERASE, 140732658499584, 140732658499584, @@ -9510,7 +9510,7 @@ STORE, 140732658565120, 
140732658569215, STORE, 140732658552832, 140732658565119, }; - unsigned long set12[] = { /* contains 12 values. */ + static const unsigned long set12[] = { /* contains 12 values. */ STORE, 140737488347136, 140737488351231, STORE, 140732658499584, 140737488351231, ERASE, 140732658499584, 140732658499584, @@ -9537,7 +9537,7 @@ STORE, 140732658552832, 140732658565119, STORE, 140014592741375, 140014592741375, /* contrived */ STORE, 140014592733184, 140014592741376, /* creates first entry retry. */ }; - unsigned long set13[] = { + static const unsigned long set13[] = { STORE, 140373516247040, 140373516251135,/*: ffffa2e7b0e10d80 */ STORE, 140373516251136, 140373516255231,/*: ffffa2e7b1195d80 */ STORE, 140373516255232, 140373516443647,/*: ffffa2e7b0e109c0 */ @@ -9550,7 +9550,7 @@ STORE, 140373518684160, 140373518688254,/*: ffffa2e7b05fec00 */ STORE, 140373518688256, 140373518692351,/*: ffffa2e7bfbdcd80 */ STORE, 140373518692352, 140373518696447,/*: ffffa2e7b0749e40 */ }; - unsigned long set14[] = { + static const unsigned long set14[] = { STORE, 140737488347136, 140737488351231, STORE, 140731667996672, 140737488351231, SNULL, 140731668000767, 140737488351231, @@ -9834,7 +9834,7 @@ SNULL, 139826136543232, 139826136809471, STORE, 139826136809472, 139826136842239, STORE, 139826136543232, 139826136809471, }; - unsigned long set15[] = { + static const unsigned long set15[] = { STORE, 140737488347136, 140737488351231, STORE, 140722061451264, 140737488351231, SNULL, 140722061455359, 140737488351231, @@ -10119,7 +10119,7 @@ STORE, 139906808958976, 139906808991743, STORE, 139906808692736, 139906808958975, }; - unsigned long set16[] = { + static const unsigned long set16[] = { STORE, 94174808662016, 94174809321471, STORE, 94174811414528, 94174811426815, STORE, 94174811426816, 94174811430911, @@ -10330,7 +10330,7 @@ STORE, 139921865613312, 139921865617407, STORE, 139921865547776, 139921865564159, }; - unsigned long set17[] = { + static const unsigned long set17[] = { STORE, 94397057224704, 94397057646591, STORE, 94397057650688, 94397057691647, STORE, 94397057691648, 94397057695743, @@ -10392,7 +10392,7 @@ STORE, 140720477511680, 140720477646847, STORE, 140720478302208, 140720478314495, STORE, 140720478314496, 140720478318591, }; - unsigned long set18[] = { + static const unsigned long set18[] = { STORE, 140737488347136, 140737488351231, STORE, 140724953673728, 140737488351231, SNULL, 140724953677823, 140737488351231, @@ -10425,7 +10425,7 @@ STORE, 140222970597376, 140222970605567, ERASE, 140222970597376, 140222970605567, STORE, 140222970597376, 140222970605567, }; - unsigned long set19[] = { + static const unsigned long set19[] = { STORE, 140737488347136, 140737488351231, STORE, 140725182459904, 140737488351231, SNULL, 140725182463999, 140737488351231, @@ -10694,7 +10694,7 @@ STORE, 140656836775936, 140656836780031, STORE, 140656787476480, 140656791920639, ERASE, 140656774639616, 140656779083775, }; - unsigned long set20[] = { + static const unsigned long set20[] = { STORE, 140737488347136, 140737488351231, STORE, 140735952392192, 140737488351231, SNULL, 140735952396287, 140737488351231, @@ -10850,7 +10850,7 @@ STORE, 140590386819072, 140590386823167, STORE, 140590386823168, 140590386827263, SNULL, 140590376591359, 140590376595455, }; - unsigned long set21[] = { + static const unsigned long set21[] = { STORE, 93874710941696, 93874711363583, STORE, 93874711367680, 93874711408639, STORE, 93874711408640, 93874711412735, @@ -10920,7 +10920,7 @@ ERASE, 140708393312256, 140708393316351, ERASE, 140708393308160, 
140708393312255, ERASE, 140708393291776, 140708393308159, }; - unsigned long set22[] = { + static const unsigned long set22[] = { STORE, 93951397134336, 93951397183487, STORE, 93951397183488, 93951397728255, STORE, 93951397728256, 93951397826559, @@ -11047,7 +11047,7 @@ STORE, 140551361253376, 140551361519615, ERASE, 140551361253376, 140551361519615, }; - unsigned long set23[] = { + static const unsigned long set23[] = { STORE, 94014447943680, 94014448156671, STORE, 94014450253824, 94014450257919, STORE, 94014450257920, 94014450266111, @@ -14371,7 +14371,7 @@ SNULL, 140175956627455, 140175985139711, STORE, 140175927242752, 140175956627455, STORE, 140175956627456, 140175985139711, }; - unsigned long set24[] = { + static const unsigned long set24[] = { STORE, 140737488347136, 140737488351231, STORE, 140735281639424, 140737488351231, SNULL, 140735281643519, 140737488351231, @@ -15533,7 +15533,7 @@ ERASE, 139635393024000, 139635401412607, ERASE, 139635384627200, 139635384631295, ERASE, 139635384631296, 139635393019903, }; - unsigned long set25[] = { + static const unsigned long set25[] = { STORE, 140737488347136, 140737488351231, STORE, 140737488343040, 140737488351231, STORE, 140722547441664, 140737488351231, @@ -22321,7 +22321,7 @@ STORE, 140249652703232, 140249682087935, STORE, 140249682087936, 140249710600191, }; - unsigned long set26[] = { + static const unsigned long set26[] = { STORE, 140737488347136, 140737488351231, STORE, 140729464770560, 140737488351231, SNULL, 140729464774655, 140737488351231, @@ -22345,7 +22345,7 @@ ERASE, 140109040951296, 140109040959487, STORE, 140109040955392, 140109040959487, ERASE, 140109040955392, 140109040959487, }; - unsigned long set27[] = { + static const unsigned long set27[] = { STORE, 140737488347136, 140737488351231, STORE, 140726128070656, 140737488351231, SNULL, 140726128074751, 140737488351231, @@ -22741,7 +22741,7 @@ STORE, 140415509696512, 140415535910911, ERASE, 140415537422336, 140415562588159, STORE, 140415482433536, 140415509696511, }; - unsigned long set28[] = { + static const unsigned long set28[] = { STORE, 140737488347136, 140737488351231, STORE, 140722475622400, 140737488351231, SNULL, 140722475626495, 140737488351231, @@ -22809,7 +22809,7 @@ STORE, 139918413348864, 139918413352959, ERASE, 139918413316096, 139918413344767, STORE, 93865848528896, 93865848664063, }; - unsigned long set29[] = { + static const unsigned long set29[] = { STORE, 140737488347136, 140737488351231, STORE, 140734467944448, 140737488351231, SNULL, 140734467948543, 140737488351231, @@ -23684,7 +23684,7 @@ ERASE, 140143079972864, 140143088361471, ERASE, 140143205793792, 140143205797887, ERASE, 140143205797888, 140143214186495, }; - unsigned long set30[] = { + static const unsigned long set30[] = { STORE, 140737488347136, 140737488351231, STORE, 140733436743680, 140737488351231, SNULL, 140733436747775, 140737488351231, @@ -24566,7 +24566,7 @@ ERASE, 140165225893888, 140165225897983, ERASE, 140165225897984, 140165234286591, ERASE, 140165058105344, 140165058109439, }; - unsigned long set31[] = { + static const unsigned long set31[] = { STORE, 140737488347136, 140737488351231, STORE, 140730890784768, 140737488351231, SNULL, 140730890788863, 140737488351231, @@ -25379,7 +25379,7 @@ ERASE, 140623906590720, 140623914979327, ERASE, 140622950277120, 140622950281215, ERASE, 140622950281216, 140622958669823, }; - unsigned long set32[] = { + static const unsigned long set32[] = { STORE, 140737488347136, 140737488351231, STORE, 140731244212224, 140737488351231, SNULL, 
140731244216319, 140737488351231, @@ -26175,7 +26175,7 @@ ERASE, 140400417288192, 140400425676799, ERASE, 140400283066368, 140400283070463, ERASE, 140400283070464, 140400291459071, }; - unsigned long set33[] = { + static const unsigned long set33[] = { STORE, 140737488347136, 140737488351231, STORE, 140734562918400, 140737488351231, SNULL, 140734562922495, 140737488351231, @@ -26317,7 +26317,7 @@ STORE, 140582961786880, 140583003750399, ERASE, 140582961786880, 140583003750399, }; - unsigned long set34[] = { + static const unsigned long set34[] = { STORE, 140737488347136, 140737488351231, STORE, 140731327180800, 140737488351231, SNULL, 140731327184895, 140737488351231, @@ -27198,7 +27198,7 @@ ERASE, 140012522094592, 140012530483199, ERASE, 140012033142784, 140012033146879, ERASE, 140012033146880, 140012041535487, }; - unsigned long set35[] = { + static const unsigned long set35[] = { STORE, 140737488347136, 140737488351231, STORE, 140730536939520, 140737488351231, SNULL, 140730536943615, 140737488351231, @@ -27955,7 +27955,7 @@ ERASE, 140474471936000, 140474480324607, ERASE, 140474396430336, 140474396434431, ERASE, 140474396434432, 140474404823039, }; - unsigned long set36[] = { + static const unsigned long set36[] = { STORE, 140737488347136, 140737488351231, STORE, 140723893125120, 140737488351231, SNULL, 140723893129215, 140737488351231, @@ -28816,7 +28816,7 @@ ERASE, 140121890357248, 140121898745855, ERASE, 140121269587968, 140121269592063, ERASE, 140121269592064, 140121277980671, }; - unsigned long set37[] = { + static const unsigned long set37[] = { STORE, 140737488347136, 140737488351231, STORE, 140722404016128, 140737488351231, SNULL, 140722404020223, 140737488351231, @@ -28942,7 +28942,7 @@ STORE, 139759821246464, 139759888355327, ERASE, 139759821246464, 139759888355327, ERASE, 139759888355328, 139759955464191, }; - unsigned long set38[] = { + static const unsigned long set38[] = { STORE, 140737488347136, 140737488351231, STORE, 140730666221568, 140737488351231, SNULL, 140730666225663, 140737488351231, @@ -29752,7 +29752,7 @@ ERASE, 140613504712704, 140613504716799, ERASE, 140613504716800, 140613513105407, }; - unsigned long set39[] = { + static const unsigned long set39[] = { STORE, 140737488347136, 140737488351231, STORE, 140736271417344, 140737488351231, SNULL, 140736271421439, 140737488351231, @@ -30124,7 +30124,7 @@ STORE, 140325364428800, 140325372821503, STORE, 140325356036096, 140325364428799, SNULL, 140325364432895, 140325372821503, }; - unsigned long set40[] = { + static const unsigned long set40[] = { STORE, 140737488347136, 140737488351231, STORE, 140734309167104, 140737488351231, SNULL, 140734309171199, 140737488351231, @@ -30875,7 +30875,7 @@ ERASE, 140320289300480, 140320289304575, ERASE, 140320289304576, 140320297693183, ERASE, 140320163409920, 140320163414015, }; - unsigned long set41[] = { + static const unsigned long set41[] = { STORE, 140737488347136, 140737488351231, STORE, 140728157171712, 140737488351231, SNULL, 140728157175807, 140737488351231, @@ -31185,7 +31185,7 @@ STORE, 94376135090176, 94376135094271, STORE, 94376135094272, 94376135098367, SNULL, 94376135094272, 94377208836095, }; - unsigned long set42[] = { + static const unsigned long set42[] = { STORE, 314572800, 1388314623, STORE, 1462157312, 1462169599, STORE, 1462169600, 1462185983, @@ -33862,7 +33862,7 @@ SNULL, 3798999040, 3799101439, */ }; - unsigned long set43[] = { + static const unsigned long set43[] = { STORE, 140737488347136, 140737488351231, STORE, 140734187720704, 140737488351231, SNULL, 
140734187724800, 140737488351231, @@ -34513,7 +34513,7 @@ static void *rcu_reader_rev(void *ptr) if (mas.index != r_start) { alt = xa_mk_value(index + i * 2 + 1 + RCU_RANGE_COUNT); - mt_dump(test->mt); + mt_dump(test->mt, mt_dump_dec); printk("Error: %lu-%lu %p != %lu-%lu %p %p line %d i %d\n", mas.index, mas.last, entry, r_start, r_end, expected, alt, @@ -34996,7 +34996,7 @@ void run_check_rcu_slowread(struct maple_tree *mt, struct rcu_test_struct *vals) MT_BUG_ON(mt, !vals->seen_entry3); MT_BUG_ON(mt, !vals->seen_both); } -static noinline void check_rcu_simulated(struct maple_tree *mt) +static noinline void __init check_rcu_simulated(struct maple_tree *mt) { unsigned long i, nr_entries = 1000; unsigned long target = 4320; @@ -35157,7 +35157,7 @@ static noinline void check_rcu_simulated(struct maple_tree *mt) rcu_unregister_thread(); } -static noinline void check_rcu_threaded(struct maple_tree *mt) +static noinline void __init check_rcu_threaded(struct maple_tree *mt) { unsigned long i, nr_entries = 1000; struct rcu_test_struct vals; @@ -35259,6 +35259,7 @@ static void mas_dfs_preorder(struct ma_state *mas) struct maple_enode *prev; unsigned char end, slot = 0; + unsigned long *pivots; if (mas->node == MAS_START) { mas_start(mas); @@ -35291,6 +35292,9 @@ walk_up: mas_ascend(mas); goto walk_up; } + pivots = ma_pivots(mte_to_node(prev), mte_node_type(prev)); + mas->max = mas_safe_pivot(mas, pivots, slot, mte_node_type(prev)); + mas->min = mas_safe_min(mas, pivots, slot); return; done: @@ -35366,7 +35370,7 @@ static void check_dfs_preorder(struct maple_tree *mt) /* End of depth first search tests */ /* Preallocation testing */ -static noinline void check_prealloc(struct maple_tree *mt) +static noinline void __init check_prealloc(struct maple_tree *mt) { unsigned long i, max = 100; unsigned long allocated; @@ -35494,7 +35498,7 @@ static noinline void check_prealloc(struct maple_tree *mt) /* End of preallocation testing */ /* Spanning writes, writes that span nodes and layers of the tree */ -static noinline void check_spanning_write(struct maple_tree *mt) +static noinline void __init check_spanning_write(struct maple_tree *mt) { unsigned long i, max = 5000; MA_STATE(mas, mt, 1200, 2380); @@ -35662,7 +35666,7 @@ static noinline void check_spanning_write(struct maple_tree *mt) /* End of spanning write testing */ /* Writes to a NULL area that are adjacent to other NULLs */ -static noinline void check_null_expand(struct maple_tree *mt) +static noinline void __init check_null_expand(struct maple_tree *mt) { unsigned long i, max = 100; unsigned char data_end; @@ -35723,7 +35727,7 @@ static noinline void check_null_expand(struct maple_tree *mt) /* End of NULL area expansions */ /* Checking for no memory is best done outside the kernel */ -static noinline void check_nomem(struct maple_tree *mt) +static noinline void __init check_nomem(struct maple_tree *mt) { MA_STATE(ms, mt, 1, 1); @@ -35758,7 +35762,7 @@ static noinline void check_nomem(struct maple_tree *mt) mtree_destroy(mt); } -static noinline void check_locky(struct maple_tree *mt) +static noinline void __init check_locky(struct maple_tree *mt) { MA_STATE(ms, mt, 2, 2); MA_STATE(reader, mt, 2, 2); @@ -35780,10 +35784,10 @@ void farmer_tests(void) struct maple_node *node; DEFINE_MTREE(tree); - mt_dump(&tree); + mt_dump(&tree, mt_dump_dec); tree.ma_root = xa_mk_value(0); - mt_dump(&tree); + mt_dump(&tree, mt_dump_dec); node = mt_alloc_one(GFP_KERNEL); node->parent = (void *)((unsigned long)(&tree) | 1); @@ -35793,7 +35797,7 @@ void 
farmer_tests(void) node->mr64.pivot[1] = 1; node->mr64.pivot[2] = 0; tree.ma_root = mt_mk_node(node, maple_leaf_64); - mt_dump(&tree); + mt_dump(&tree, mt_dump_dec); node->parent = ma_parent_ptr(node); ma_free_rcu(node); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 90a62cf75008..5d6fc3f39284 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -4,6 +4,7 @@ TARGETS += amd-pstate TARGETS += arm64 TARGETS += bpf TARGETS += breakpoints +TARGETS += cachestat TARGETS += capabilities TARGETS += cgroup TARGETS += clone3 diff --git a/tools/testing/selftests/cachestat/.gitignore b/tools/testing/selftests/cachestat/.gitignore new file mode 100644 index 000000000000..d6c30b43a4bb --- /dev/null +++ b/tools/testing/selftests/cachestat/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +test_cachestat diff --git a/tools/testing/selftests/cachestat/Makefile b/tools/testing/selftests/cachestat/Makefile new file mode 100644 index 000000000000..fca73aaa7d14 --- /dev/null +++ b/tools/testing/selftests/cachestat/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := test_cachestat + +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -Wall +CFLAGS += -lrt + +include ../lib.mk diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c new file mode 100644 index 000000000000..54d09b820ed4 --- /dev/null +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdbool.h> +#include <linux/kernel.h> +#include <linux/mman.h> +#include <sys/mman.h> +#include <sys/shm.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <string.h> +#include <fcntl.h> +#include <errno.h> + +#include "../kselftest.h" + +static const char * const dev_files[] = { + "/dev/zero", "/dev/null", "/dev/urandom", + "/proc/version", "/proc" +}; +static const int cachestat_nr = 451; + +void print_cachestat(struct cachestat *cs) +{ + ksft_print_msg( + "Using cachestat: Cached: %lu, Dirty: %lu, Writeback: %lu, Evicted: %lu, Recently Evicted: %lu\n", + cs->nr_cache, cs->nr_dirty, cs->nr_writeback, + cs->nr_evicted, cs->nr_recently_evicted); +} + +bool write_exactly(int fd, size_t filesize) +{ + int random_fd = open("/dev/urandom", O_RDONLY); + char *cursor, *data; + int remained; + bool ret; + + if (random_fd < 0) { + ksft_print_msg("Unable to access urandom.\n"); + ret = false; + goto out; + } + + data = malloc(filesize); + if (!data) { + ksft_print_msg("Unable to allocate data.\n"); + ret = false; + goto close_random_fd; + } + + remained = filesize; + cursor = data; + + while (remained) { + ssize_t read_len = read(random_fd, cursor, remained); + + if (read_len <= 0) { + ksft_print_msg("Unable to read from urandom.\n"); + ret = false; + goto out_free_data; + } + + remained -= read_len; + cursor += read_len; + } + + /* write random data to fd */ + remained = filesize; + cursor = data; + while (remained) { + ssize_t write_len = write(fd, cursor, remained); + + if (write_len <= 0) { + ksft_print_msg("Unable write random data to file.\n"); + ret = false; + goto out_free_data; + } + + remained -= write_len; + cursor += write_len; + } + + ret = true; +out_free_data: + free(data); +close_random_fd: + close(random_fd); +out: + return ret; +} + +/* + * Open/create the file at filename, (optionally) write random data to it + * (exactly num_pages), then test the cachestat syscall on 
this file. + * + * If test_fsync == true, fsync the file, then check the number of dirty + * pages. + */ +bool test_cachestat(const char *filename, bool write_random, bool create, + bool test_fsync, unsigned long num_pages, int open_flags, + mode_t open_mode) +{ + size_t PS = sysconf(_SC_PAGESIZE); + int filesize = num_pages * PS; + bool ret = true; + long syscall_ret; + struct cachestat cs; + struct cachestat_range cs_range = { 0, filesize }; + + int fd = open(filename, open_flags, open_mode); + + if (fd == -1) { + ksft_print_msg("Unable to create/open file.\n"); + ret = false; + goto out; + } else { + ksft_print_msg("Create/open %s\n", filename); + } + + if (write_random) { + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to access urandom.\n"); + ret = false; + goto out1; + } + } + + syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0); + + ksft_print_msg("Cachestat call returned %ld\n", syscall_ret); + + if (syscall_ret) { + ksft_print_msg("Cachestat returned non-zero.\n"); + ret = false; + goto out1; + + } else { + print_cachestat(&cs); + + if (write_random) { + if (cs.nr_cache + cs.nr_evicted != num_pages) { + ksft_print_msg( + "Total number of cached and evicted pages is off.\n"); + ret = false; + } + } + } + + if (test_fsync) { + if (fsync(fd)) { + ksft_print_msg("fsync fails.\n"); + ret = false; + } else { + syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0); + + ksft_print_msg("Cachestat call (after fsync) returned %ld\n", + syscall_ret); + + if (!syscall_ret) { + print_cachestat(&cs); + + if (cs.nr_dirty) { + ret = false; + ksft_print_msg( + "Number of dirty should be zero after fsync.\n"); + } + } else { + ksft_print_msg("Cachestat (after fsync) returned non-zero.\n"); + ret = false; + goto out1; + } + } + } + +out1: + close(fd); + + if (create) + remove(filename); +out: + return ret; +} + +bool test_cachestat_shmem(void) +{ + size_t PS = sysconf(_SC_PAGESIZE); + size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ + int syscall_ret; + size_t compute_len = PS * 512; + struct cachestat_range cs_range = { PS, compute_len }; + char *filename = "tmpshmcstat"; + struct cachestat cs; + bool ret = true; + unsigned long num_pages = compute_len / PS; + int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + + if (fd < 0) { + ksft_print_msg("Unable to create shmem file.\n"); + ret = false; + goto out; + } + + if (ftruncate(fd, filesize)) { + ksft_print_msg("Unable to truncate shmem file.\n"); + ret = false; + goto close_fd; + } + + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to write to shmem file.\n"); + ret = false; + goto close_fd; + } + + syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0); + + if (syscall_ret) { + ksft_print_msg("Cachestat returned non-zero.\n"); + ret = false; + goto close_fd; + } else { + print_cachestat(&cs); + if (cs.nr_cache + cs.nr_evicted != num_pages) { + ksft_print_msg( + "Total number of cached and evicted pages is off.\n"); + ret = false; + } + } + +close_fd: + shm_unlink(filename); +out: + return ret; +} + +int main(void) +{ + int ret = 0; + + for (int i = 0; i < 5; i++) { + const char *dev_filename = dev_files[i]; + + if (test_cachestat(dev_filename, false, false, false, + 4, O_RDONLY, 0400)) + ksft_test_result_pass("cachestat works with %s\n", dev_filename); + else { + ksft_test_result_fail("cachestat fails with %s\n", dev_filename); + ret = 1; + } + } + + if (test_cachestat("tmpfilecachestat", true, true, + true, 4, O_CREAT | O_RDWR, 0400 | 0600)) + ksft_test_result_pass("cachestat works with a 
normal file\n"); + else { + ksft_test_result_fail("cachestat fails with normal file\n"); + ret = 1; + } + + if (test_cachestat_shmem()) + ksft_test_result_pass("cachestat works with a shmem file\n"); + else { + ksft_test_result_fail("cachestat fails with a shmem file\n"); + ret = 1; + } + + return ret; +} diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 9bfe1d6f6529..e033c79d528e 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -61,8 +61,7 @@ static void async_pf_execute(struct work_struct *work) * access remotely. */ mmap_read_lock(mm); - get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, NULL, - &locked); + get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked); if (locked) mmap_read_unlock(mm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb5c13eee193..eaa5bb8dbadc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2477,7 +2477,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - rc = get_user_pages(addr, 1, flags, NULL, NULL); + rc = get_user_pages(addr, 1, flags, NULL); return rc == -EHWPOISON; } |
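
A minimal userspace sketch of calling the new cachestat() syscall exercised by the selftest above (the selftest hard-codes syscall number 451). This is illustrative only: the struct layouts and the cachestat_range member names "off" and "len" are assumptions mirroring what the selftest passes, and on a system with updated UAPI headers they would come from <linux/mman.h>, which the selftest includes, rather than being redefined locally.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/syscall.h>

/* Assumed mirror of the UAPI structures used by the selftest. */
struct cachestat_range {
	uint64_t off;
	uint64_t len;
};

struct cachestat {
	uint64_t nr_cache;
	uint64_t nr_dirty;
	uint64_t nr_writeback;
	uint64_t nr_evicted;
	uint64_t nr_recently_evicted;
};

int main(int argc, char **argv)
{
	const long cachestat_nr = 451;	/* same number the selftest uses */
	const char *path = argc > 1 ? argv[1] : "/etc/hostname";
	struct cachestat_range range = { 0, 0 };
	struct cachestat cs;
	struct stat st;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st)) {
		perror(path);
		return 1;
	}

	/* Query the whole file, as the selftest does for regular files. */
	range.len = (uint64_t)st.st_size;

	if (syscall(cachestat_nr, fd, &range, &cs, 0)) {
		perror("cachestat");
		close(fd);
		return 1;
	}

	printf("%s: cached %llu dirty %llu writeback %llu evicted %llu recently evicted %llu\n",
	       path,
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}

Run against a file that was recently read and nr_cache should be nonzero; a zero return with all-zero counters on a cold file is also a valid result.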
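
The mm/workingset.c hunks in this section split the refault-distance check out of workingset_refault() into workingset_test_recent(), so the "was this evicted folio recently part of the workingset?" question can be answered without an actual refault (this is what lets cachestat classify shadow entries). Stripped of the memcg and LRU bookkeeping, the core test is modular distance arithmetic. The sketch below is a standalone illustration with made-up constants and names, not the kernel code itself:

#include <stdbool.h>

/* Illustrative counter width only; the kernel packs more state in shadows. */
#define EXAMPLE_EVICTION_BITS	30u
#define EXAMPLE_EVICTION_MASK	((1ul << EXAMPLE_EVICTION_BITS) - 1)

/*
 * An eviction records the LRU "nonresident age" at eviction time. Later,
 * the refault distance is how far that age has advanced since then,
 * modulo the counter width; the entry counts as recently evicted when
 * the distance still fits within the current workingset size.
 */
static bool example_test_recent(unsigned long nonresident_age_now,
				unsigned long age_at_eviction,
				unsigned long workingset_size)
{
	/* Unsigned subtraction plus the mask handles counter wrap-around. */
	unsigned long refault_distance =
		(nonresident_age_now - age_at_eviction) & EXAMPLE_EVICTION_MASK;

	return refault_distance <= workingset_size;
}

int main(void)
{
	/* Evicted at age 1000, age is now 1200: distance 200, within 512. */
	return example_test_recent(1200, 1000, 512) ? 0 : 1;
}

The design point of the split is visible in the diff: workingset_refault() keeps the folio-side accounting (mod_lruvec_state, activation), while the distance comparison lives in the new helper that both paths share.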
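
The mm/zsmalloc.c hunks replace the open-coded "(x) & ~PAGE_MASK" with offset_in_page(x). The two are equivalent because PAGE_MASK is ~(PAGE_SIZE - 1), so the change is purely a readability cleanup. A quick userspace check of that identity, with the page size pinned to 4096 purely for the example:

#include <assert.h>

#define EX_PAGE_SIZE	4096ul
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))
/* offset_in_page() in the kernel is defined along these lines. */
#define ex_offset_in_page(p)	((unsigned long)(p) & ~EX_PAGE_MASK)

int main(void)
{
	unsigned long x;

	/* The helper and the low-bits mask agree for any offset. */
	for (x = 0; x < 16 * EX_PAGE_SIZE; x += 123)
		assert(ex_offset_in_page(x) == (x & (EX_PAGE_SIZE - 1)));
	return 0;
}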