summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt2
-rw-r--r--MAINTAINERS7
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/arm64/kernel/mte.c17
-rw-r--r--arch/arm64/kernel/traps.c2
-rw-r--r--arch/arm64/mm/fault.c2
-rw-r--r--arch/arm64/mm/init.c2
-rw-r--r--arch/ia64/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/include/asm/fw/cfe/cfe_api.h3
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c2
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/kvm/interrupt.c2
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c2
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--drivers/dma-buf/heaps/system_heap.c5
-rw-r--r--drivers/gpu/drm/radeon/radeon_ttm.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c2
-rw-r--r--drivers/infiniband/sw/siw/siw_mem.c2
-rw-r--r--drivers/iommu/iommufd/pages.c4
-rw-r--r--drivers/media/v4l2-core/videobuf-dma-sg.c2
-rw-r--r--drivers/misc/sgi-gru/grufault.c2
-rw-r--r--drivers/vdpa/vdpa_user/vduse_dev.c2
-rw-r--r--drivers/vfio/vfio_iommu_type1.c2
-rw-r--r--drivers/vhost/vdpa.c2
-rw-r--r--fs/exec.c2
-rw-r--r--fs/fs-writeback.c16
-rw-r--r--fs/hugetlbfs/inode.c5
-rw-r--r--fs/ntfs/mft.c38
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/task_mmu.c12
-rw-r--r--fs/squashfs/block.c115
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c12
-rw-r--r--include/linux/cgroup.h1
-rw-r--r--include/linux/dma-map-ops.h6
-rw-r--r--include/linux/fault-inject.h9
-rw-r--r--include/linux/gfp.h15
-rw-r--r--include/linux/hugetlb.h23
-rw-r--r--include/linux/kasan.h2
-rw-r--r--include/linux/maple_tree.h130
-rw-r--r--include/linux/memcontrol.h5
-rw-r--r--include/linux/memory_hotplug.h3
-rw-r--r--include/linux/mm.h152
-rw-r--r--include/linux/mmdebug.h14
-rw-r--r--include/linux/mmzone.h21
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/stackdepot.h9
-rw-r--r--include/linux/suspend.h9
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/syscalls.h5
-rw-r--r--include/linux/types.h1
-rw-r--r--include/uapi/asm-generic/unistd.h5
-rw-r--r--include/uapi/linux/mman.h14
-rw-r--r--init/Kconfig10
-rw-r--r--io_uring/rsrc.c34
-rw-r--r--kernel/cgroup/rstat.c26
-rw-r--r--kernel/dma/Kconfig6
-rw-r--r--kernel/dma/contiguous.c8
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/power/main.c27
-rw-r--r--kernel/power/power.h5
-rw-r--r--kernel/power/snapshot.c52
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c67
-rw-r--r--kernel/trace/trace_events_user.c2
-rw-r--r--lib/Kconfig.debug10
-rw-r--r--lib/Makefile2
-rw-r--r--lib/maple_tree.c1170
-rw-r--r--lib/show_mem.c37
-rw-r--r--lib/stackdepot.c128
-rw-r--r--lib/test_maple_tree.c863
-rw-r--r--lib/zstd/common/zstd_deps.h18
-rw-r--r--mm/Makefile4
-rw-r--r--mm/compaction.c72
-rw-r--r--mm/debug.c9
-rw-r--r--mm/debug_page_alloc.c59
-rw-r--r--mm/dmapool.c10
-rw-r--r--mm/fail_page_alloc.c66
-rw-r--r--mm/filemap.c201
-rw-r--r--mm/gup.c255
-rw-r--r--mm/gup_test.c14
-rw-r--r--mm/hugetlb.c24
-rw-r--r--mm/hugetlb_vmemmap.c11
-rw-r--r--mm/internal.h41
-rw-r--r--mm/kasan/common.c2
-rw-r--r--mm/kasan/generic.c72
-rw-r--r--mm/kasan/kasan.h159
-rw-r--r--mm/kasan/report.c17
-rw-r--r--mm/kasan/report_generic.c12
-rw-r--r--mm/kasan/report_hw_tags.c2
-rw-r--r--mm/kasan/report_sw_tags.c2
-rw-r--r--mm/kasan/shadow.c36
-rw-r--r--mm/kasan/sw_tags.c20
-rw-r--r--mm/memcontrol.c192
-rw-r--r--mm/memory-failure.c11
-rw-r--r--mm/memory.c20
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/migrate.c161
-rw-r--r--mm/mm_init.c32
-rw-r--r--mm/mmap.c169
-rw-r--r--mm/oom_kill.c8
-rw-r--r--mm/page_alloc.c848
-rw-r--r--mm/page_owner.c54
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/secretmem.c2
-rw-r--r--mm/show_mem.c429
-rw-r--r--mm/swapfile.c1
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/workingset.c150
-rw-r--r--mm/zsmalloc.c12
-rw-r--r--net/xdp/xdp_umem.c2
-rw-r--r--scripts/spelling.txt22
-rw-r--r--security/tomoyo/domain.c2
-rw-r--r--tools/testing/radix-tree/linux/init.h1
-rw-r--r--tools/testing/radix-tree/maple.c164
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/cachestat/.gitignore2
-rw-r--r--tools/testing/selftests/cachestat/Makefile8
-rw-r--r--tools/testing/selftests/cachestat/test_cachestat.c269
-rw-r--r--virt/kvm/async_pf.c3
-rw-r--r--virt/kvm/kvm_main.c2
139 files changed, 4388 insertions, 2486 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e5bab29685f..a341ef853fc0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -692,7 +692,7 @@
kernel/dma/contiguous.c
cma_pernuma=nn[MG]
- [ARM64,KNL,CMA]
+ [KNL,CMA]
Sets the size of kernel per-numa memory area for
contiguous memory allocations. A value of 0 disables
per-numa CMA altogether. And If this option is not
diff --git a/MAINTAINERS b/MAINTAINERS
index 6d0f112eb649..67be30a92cc1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4478,6 +4478,13 @@ S: Supported
F: Documentation/filesystems/caching/cachefiles.rst
F: fs/cachefiles/
+CACHESTAT: PAGE CACHE STATS FOR A FILE
+M: Nhat Pham <nphamcs@gmail.com>
+M: Johannes Weiner <hannes@cmpxchg.org>
+L: linux-mm@kvack.org
+S: Maintained
+F: tools/testing/selftests/cachestat/test_cachestat.c
+
CADENCE MIPI-CSI2 BRIDGES
M: Maxime Ripard <mripard@kernel.org>
L: linux-media@vger.kernel.org
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 8ebacf37a8cf..1f13995d00d7 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -490,3 +490,4 @@
558 common process_mrelease sys_process_mrelease
559 common futex_waitv sys_futex_waitv
560 common set_mempolicy_home_node sys_ni_syscall
+561 common cachestat sys_cachestat
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index ac964612d8b0..8ebed8a13874 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -464,3 +464,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 037feba03a51..64a514f90131 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 451
+#define __NR_compat_syscalls 452
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 604a2053d006..d952a28463e0 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
#define __NR_set_mempolicy_home_node 450
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+#define __NR_cachestat 451
+__SYSCALL(__NR_cachestat, sys_cachestat)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 7e89968bd282..4c5ef9b20065 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -416,10 +416,9 @@ long get_mte_ctrl(struct task_struct *task)
static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
struct iovec *kiov, unsigned int gup_flags)
{
- struct vm_area_struct *vma;
void __user *buf = kiov->iov_base;
size_t len = kiov->iov_len;
- int ret;
+ int err = 0;
int write = gup_flags & FOLL_WRITE;
if (!access_ok(buf, len))
@@ -429,14 +428,16 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
return -EIO;
while (len) {
+ struct vm_area_struct *vma;
unsigned long tags, offset;
void *maddr;
- struct page *page = NULL;
+ struct page *page = get_user_page_vma_remote(mm, addr,
+ gup_flags, &vma);
- ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page,
- &vma, NULL);
- if (ret <= 0)
+ if (IS_ERR_OR_NULL(page)) {
+ err = page == NULL ? -EIO : PTR_ERR(page);
break;
+ }
/*
* Only copy tags if the page has been mapped as PROT_MTE
@@ -446,7 +447,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
* was never mapped with PROT_MTE.
*/
if (!(vma->vm_flags & VM_MTE)) {
- ret = -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
put_page(page);
break;
}
@@ -479,7 +480,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
kiov->iov_len = buf - kiov->iov_base;
if (!kiov->iov_len) {
/* check for error accessing the tracee's address space */
- if (ret <= 0)
+ if (err)
return -EIO;
else
return -EFAULT;
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 4bb1b8f47298..7b889445e5c6 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -1044,7 +1044,7 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr)
bool recover = esr & KASAN_ESR_RECOVER;
bool write = esr & KASAN_ESR_WRITE;
size_t size = KASAN_ESR_SIZE(esr);
- u64 addr = regs->regs[0];
+ void *addr = (void *)regs->regs[0];
u64 pc = regs->pc;
kasan_report(addr, size, write, pc);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index cb21ccd7940d..d5047eef4295 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -317,7 +317,7 @@ static void report_tag_fault(unsigned long addr, unsigned long esr,
* find out access size.
*/
bool is_write = !!(esr & ESR_ELx_WNR);
- kasan_report(addr, 0, is_write, regs->pc);
+ kasan_report((void *)addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 66e70ca47680..d560aef6aafa 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -410,8 +410,6 @@ void __init bootmem_init(void)
arm64_hugetlb_cma_reserve();
#endif
- dma_pernuma_cma_reserve();
-
kvm_hyp_reserve();
/*
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 72c929d9902b..f8c74ffeeefb 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -371,3 +371,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index b1f3940bc298..4f504783371f 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -450,3 +450,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 820145e47350..858d22bf275c 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/mips/include/asm/fw/cfe/cfe_api.h b/arch/mips/include/asm/fw/cfe/cfe_api.h
index 25df2f4deb31..b52a6a9c26f1 100644
--- a/arch/mips/include/asm/fw/cfe/cfe_api.h
+++ b/arch/mips/include/asm/fw/cfe/cfe_api.h
@@ -17,9 +17,6 @@
#include <linux/types.h>
#include <linux/string.h>
-typedef long intptr_t;
-
-
/*
* Constants
*/
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 253ff994ed2e..1976317d4e8b 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -389,3 +389,4 @@
448 n32 process_mrelease sys_process_mrelease
449 n32 futex_waitv sys_futex_waitv
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node
+451 n32 cachestat sys_cachestat
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 3f1886ad9d80..cfda2511badf 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -365,3 +365,4 @@
448 n64 process_mrelease sys_process_mrelease
449 n64 futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 n64 cachestat sys_cachestat
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 8f243e35a7b2..7692234c3768 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -438,3 +438,4 @@
448 o32 process_mrelease sys_process_mrelease
449 o32 futex_waitv sys_futex_waitv
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node
+451 o32 cachestat sys_cachestat
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 0e42fceb2d5e..3c71fad78318 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -448,3 +448,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index a0be127475b1..8c0b08b7a80e 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -537,3 +537,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 81d7185e2ae8..d19fb1f3007d 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
FOLL_WRITE | FOLL_LONGTERM,
- mem->hpages + entry, NULL);
+ mem->hpages + entry);
if (ret == n) {
pinned += n;
continue;
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index b68f47541169..a6935af2235c 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
448 common process_mrelease sys_process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat sys_cachestat
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index da6dac36e959..9bd0a873f3b1 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2777,7 +2777,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
mmap_read_lock(kvm->mm);
get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
- &page, NULL, NULL);
+ &page, NULL);
mmap_read_unlock(kvm->mm);
return page;
}
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 2de85c977f54..97377e8c5025 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4398cc6fb68d..faa835f3c54a 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -496,3 +496,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 320480a8db4f..bc0a3c941b35 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -455,3 +455,4 @@
448 i386 process_mrelease sys_process_mrelease
449 i386 futex_waitv sys_futex_waitv
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
+451 i386 cachestat sys_cachestat
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..227538b0ce80 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 21ca0a831b70..5d390df21440 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -214,7 +214,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl,
if (!(vma->vm_flags & VM_MAYEXEC))
return -EACCES;
- ret = get_user_pages(src, 1, 0, &src_page, NULL);
+ ret = get_user_pages(src, 1, 0, &src_page);
if (ret < 1)
return -EFAULT;
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 52c94ab5c205..2b69c3c035b6 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -421,3 +421,4 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index ee7059399e9c..b7e488c1b5df 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -41,7 +41,7 @@ struct dma_heap_attachment {
bool mapped;
};
-#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO)
+#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO | __GFP_RETRY_MAYFAIL)
#define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
| __GFP_NORETRY) & ~__GFP_RECLAIM) \
| __GFP_COMP)
@@ -350,6 +350,9 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
struct page *page, *tmp_page;
int i, ret = -ENOMEM;
+ if (len / PAGE_SIZE > totalram_pages())
+ return ERR_PTR(-ENOMEM);
+
buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
if (!buffer)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 2220cdf6a3f6..3a9db030f98f 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -359,7 +359,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_device *bdev, struct ttm_tt *ttm
struct page **pages = ttm->pages + pinned;
r = get_user_pages(userptr, num_pages, write ? FOLL_WRITE : 0,
- pages, NULL);
+ pages);
if (r < 0)
goto release_pages;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index f693bc753b6b..1bb7507325bc 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -111,7 +111,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
ret = pin_user_pages(start_page + got * PAGE_SIZE,
num_pages - got,
FOLL_LONGTERM | FOLL_WRITE,
- p + got, NULL);
+ p + got);
if (ret < 0) {
mmap_read_unlock(current->mm);
goto bail_release;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 2a5cac2658ec..84e0f41e7dfa 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -140,7 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
ret = pin_user_pages(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof(struct page *)),
- gup_flags, page_list, NULL);
+ gup_flags, page_list);
if (ret < 0)
goto out;
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index f51ab2ccf151..e6e25f15567d 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -422,7 +422,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
umem->page_chunk[i].plist = plist;
while (nents) {
rv = pin_user_pages(first_page_va, nents, foll_flags,
- plist, NULL);
+ plist);
if (rv < 0)
goto out_sem_up;
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 3c47846cc5ef..412ca96be128 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -786,7 +786,7 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
user->locked = 1;
}
rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
- user->gup_flags, user->upages, NULL,
+ user->gup_flags, user->upages,
&user->locked);
}
if (rc <= 0) {
@@ -1799,7 +1799,7 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
rc = pin_user_pages_remote(
pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE),
1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page,
- NULL, NULL);
+ NULL);
mmap_read_unlock(pages->source_mm);
if (rc != 1) {
if (WARN_ON(rc >= 0))
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 53001532e8e3..405b89ea1054 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -180,7 +180,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
data, size, dma->nr_pages);
err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, gup_flags,
- dma->pages, NULL);
+ dma->pages);
if (err != dma->nr_pages) {
dma->nr_pages = (err >= 0) ? err : 0;
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index b836936e9747..378cf02a2aa1 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -185,7 +185,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma,
#else
*pageshift = PAGE_SHIFT;
#endif
- if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0)
+ if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page) <= 0)
return -EFAULT;
*paddr = page_to_phys(page);
put_page(page);
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index de97e38c3b82..4d4405f058e8 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1052,7 +1052,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
goto out;
pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
- page_list, NULL);
+ page_list);
if (pinned != npages) {
ret = pinned < 0 ? pinned : -ENOMEM;
goto out;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3d4dd9420c30..3d2d9a944906 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -562,7 +562,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
mmap_read_lock(mm);
ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
- pages, NULL, NULL);
+ pages, NULL);
if (ret > 0) {
int i;
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 8c1aefc865f0..61223fcbe82b 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -983,7 +983,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
while (npages) {
sz2pin = min_t(unsigned long, npages, list_size);
pinned = pin_user_pages(cur_base, sz2pin,
- gup_flags, page_list, NULL);
+ gup_flags, page_list);
if (sz2pin != pinned) {
if (pinned < 0) {
ret = pinned;
diff --git a/fs/exec.c b/fs/exec.c
index a466e797c8e2..25c65b64544b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -220,7 +220,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
*/
mmap_read_lock(bprm->mm);
ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
- &page, NULL, NULL);
+ &page, NULL);
mmap_read_unlock(bprm->mm);
if (ret <= 0)
return NULL;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae4e51e91ee3..aca4b4811394 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2024,7 +2024,6 @@ static long wb_writeback(struct bdi_writeback *wb,
struct blk_plug plug;
blk_start_plug(&plug);
- spin_lock(&wb->list_lock);
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
@@ -2049,6 +2048,9 @@ static long wb_writeback(struct bdi_writeback *wb,
if (work->for_background && !wb_over_bg_thresh(wb))
break;
+
+ spin_lock(&wb->list_lock);
+
/*
* Kupdate and background works are special and we want to
* include all inodes that need writing. Livelock avoidance is
@@ -2078,13 +2080,19 @@ static long wb_writeback(struct bdi_writeback *wb,
* mean the overall work is done. So we keep looping as long
* as made some progress on cleaning pages or inodes.
*/
- if (progress)
+ if (progress) {
+ spin_unlock(&wb->list_lock);
continue;
+ }
+
/*
* No more inodes for IO, bail
*/
- if (list_empty(&wb->b_more_io))
+ if (list_empty(&wb->b_more_io)) {
+ spin_unlock(&wb->list_lock);
break;
+ }
+
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
@@ -2096,9 +2104,7 @@ static long wb_writeback(struct bdi_writeback *wb,
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
- spin_lock(&wb->list_lock);
}
- spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
return nr_pages - work->nr_pages;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ecfdfb2529a3..90361a922cec 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -834,9 +834,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
break;
}
- /* Set numa allocation policy based on index */
- hugetlb_set_vma_policy(&pseudo_vma, inode, index);
-
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
@@ -850,7 +847,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
rcu_read_unlock();
if (present) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- hugetlb_drop_vma_policy(&pseudo_vma);
continue;
}
@@ -862,6 +858,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* folios in these areas, we need to consume the reserves
* to keep reservation accounting consistent.
*/
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(folio)) {
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 48030899dc6e..34102fe63c0e 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1955,36 +1955,40 @@ undo_alloc:
"attribute.%s", es);
NVolSetErrors(vol);
}
- a = ctx->attr;
+
if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
ntfs_error(vol->sb, "Failed to truncate mft data attribute "
"runlist.%s", es);
NVolSetErrors(vol);
}
- if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
- if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ if (ctx) {
+ a = ctx->attr;
+ if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
a->data.non_resident.mapping_pairs_offset),
old_alen - le16_to_cpu(
- a->data.non_resident.mapping_pairs_offset),
+ a->data.non_resident.mapping_pairs_offset),
rl2, ll, -1, NULL)) {
- ntfs_error(vol->sb, "Failed to restore mapping pairs "
+ ntfs_error(vol->sb, "Failed to restore mapping pairs "
"array.%s", es);
- NVolSetErrors(vol);
- }
- if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
- ntfs_error(vol->sb, "Failed to restore attribute "
+ NVolSetErrors(vol);
+ }
+ if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
"record.%s", es);
- NVolSetErrors(vol);
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
}
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- } else if (IS_ERR(ctx->mrec)) {
- ntfs_error(vol->sb, "Failed to restore attribute search "
+ else if (IS_ERR(ctx->mrec)) {
+ ntfs_error(vol->sb, "Failed to restore attribute search "
"context.%s", es);
- NVolSetErrors(vol);
+ NVolSetErrors(vol);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
}
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
if (!IS_ERR(mrec))
unmap_mft_record(mft_ni);
up_write(&mft_ni->runlist.lock);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 25b44b303b35..5d0cf59c4926 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -419,7 +419,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
char *notes;
size_t i = 0;
- strlcpy(prpsinfo.pr_psargs, saved_command_line,
+ strscpy(prpsinfo.pr_psargs, saved_command_line,
sizeof(prpsinfo.pr_psargs));
notes = kzalloc(notes_len, GFP_KERNEL);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 420510f6a545..6259dd432eeb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1689,23 +1689,23 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
/* watch out for wraparound */
start_vaddr = end_vaddr;
if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ unsigned long end;
+
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_free;
start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
mmap_read_unlock(mm);
+
+ end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
+ if (end >= start_vaddr && end < mm->task_size)
+ end_vaddr = end;
}
/* Ensure the address is inside the task */
if (start_vaddr > mm->task_size)
start_vaddr = end_vaddr;
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
ret = 0;
while (count && (start_vaddr < end_vaddr)) {
int len;
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index bed3bb8b27fa..2b7f4c6fa77c 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -76,10 +76,100 @@ static int copy_bio_to_actor(struct bio *bio,
return copied_bytes;
}
+static int squashfs_bio_read_cached(struct bio *fullbio,
+ struct address_space *cache_mapping, u64 index, int length,
+ u64 read_start, u64 read_end, int page_count)
+{
+ struct page *head_to_cache = NULL, *tail_to_cache = NULL;
+ struct block_device *bdev = fullbio->bi_bdev;
+ int start_idx = 0, end_idx = 0;
+ struct bvec_iter_all iter_all;
+ struct bio *bio = NULL;
+ struct bio_vec *bv;
+ int idx = 0;
+ int err = 0;
+
+ bio_for_each_segment_all(bv, fullbio, iter_all) {
+ struct page *page = bv->bv_page;
+
+ if (page->mapping == cache_mapping && PageUptodate(page)) {
+ idx++;
+ continue;
+ }
+
+ /*
+ * We only use this when the device block size is the same as
+ * the page size, so read_start and read_end cover full pages.
+ *
+ * Compare these to the original required index and length to
+ * only cache pages which were requested partially, since these
+ * are the ones which are likely to be needed when reading
+ * adjacent blocks.
+ */
+ if (idx == 0 && index != read_start)
+ head_to_cache = page;
+ else if (idx == page_count - 1 && index + length != read_end)
+ tail_to_cache = page;
+
+ if (!bio || idx != end_idx) {
+ struct bio *new = bio_alloc_clone(bdev, fullbio,
+ GFP_NOIO, &fs_bio_set);
+
+ if (bio) {
+ bio_trim(bio, start_idx * PAGE_SECTORS,
+ (end_idx - start_idx) * PAGE_SECTORS);
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+
+ bio = new;
+ start_idx = idx;
+ }
+
+ idx++;
+ end_idx = idx;
+ }
+
+ if (bio) {
+ bio_trim(bio, start_idx * PAGE_SECTORS,
+ (end_idx - start_idx) * PAGE_SECTORS);
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+
+ if (err)
+ return err;
+
+ if (head_to_cache) {
+ int ret = add_to_page_cache_lru(head_to_cache, cache_mapping,
+ read_start, GFP_NOIO);
+
+ if (!ret) {
+ SetPageUptodate(head_to_cache);
+ unlock_page(head_to_cache);
+ }
+
+ }
+
+ if (tail_to_cache) {
+ int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping,
+ read_end - PAGE_SIZE, GFP_NOIO);
+
+ if (!ret) {
+ SetPageUptodate(tail_to_cache);
+ unlock_page(tail_to_cache);
+ }
+ }
+
+ return 0;
+}
+
static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
struct bio **biop, int *block_offset)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct inode *cache_inode = msblk->cache_inode;
+ struct address_space *cache_mapping = cache_inode ? cache_inode->i_mapping : NULL;
const u64 read_start = round_down(index, msblk->devblksize);
const sector_t block = read_start >> msblk->devblksize_log2;
const u64 read_end = round_up(index + length, msblk->devblksize);
@@ -99,13 +189,27 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
for (i = 0; i < page_count; ++i) {
unsigned int len =
min_t(unsigned int, PAGE_SIZE - offset, total_len);
- struct page *page = alloc_page(GFP_NOIO);
+ struct page *page = NULL;
+
+ if (cache_mapping)
+ page = find_get_page(cache_mapping,
+ read_start + i * PAGE_SIZE);
+ if (!page)
+ page = alloc_page(GFP_NOIO);
if (!page) {
error = -ENOMEM;
goto out_free_bio;
}
- if (!bio_add_page(bio, page, len, offset)) {
+
+ if (cache_mapping) {
+ /*
+ * Use the __ version to avoid merging since we need
+ * each page to be separate when we check for and avoid
+ * cached pages.
+ */
+ __bio_add_page(bio, page, len, offset);
+ } else if (!bio_add_page(bio, page, len, offset)) {
error = -EIO;
goto out_free_bio;
}
@@ -113,7 +217,12 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
total_len -= len;
}
- error = submit_bio_wait(bio);
+ if (cache_mapping)
+ error = squashfs_bio_read_cached(bio, cache_mapping, index,
+ length, read_start, read_end,
+ page_count);
+ else
+ error = submit_bio_wait(bio);
if (error)
goto out_free_bio;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 72f6f4b37863..dfee65845d48 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -47,6 +47,7 @@ struct squashfs_sb_info {
struct squashfs_cache *block_cache;
struct squashfs_cache *fragment_cache;
struct squashfs_cache *read_page;
+ struct inode *cache_inode;
int next_meta_index;
__le64 *id_table;
__le64 *fragment_index;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index e090fae48e68..64d6bc95950b 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -329,6 +329,16 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto failed_mount;
}
+ if (msblk->devblksize == PAGE_SIZE) {
+ msblk->cache_inode = new_inode(sb);
+ if (msblk->cache_inode == NULL)
+ goto failed_mount;
+
+ set_nlink(msblk->cache_inode, 1);
+ msblk->cache_inode->i_size = OFFSET_MAX;
+ mapping_set_gfp_mask(msblk->cache_inode->i_mapping, GFP_NOFS);
+ }
+
msblk->stream = squashfs_decompressor_setup(sb, flags);
if (IS_ERR(msblk->stream)) {
err = PTR_ERR(msblk->stream);
@@ -454,6 +464,7 @@ failed_mount:
squashfs_cache_delete(msblk->block_cache);
squashfs_cache_delete(msblk->fragment_cache);
squashfs_cache_delete(msblk->read_page);
+ iput(msblk->cache_inode);
msblk->thread_ops->destroy(msblk);
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
@@ -572,6 +583,7 @@ static void squashfs_put_super(struct super_block *sb)
squashfs_cache_delete(sbi->block_cache);
squashfs_cache_delete(sbi->fragment_cache);
squashfs_cache_delete(sbi->read_page);
+ iput(sbi->cache_inode);
sbi->thread_ops->destroy(sbi);
kfree(sbi->id_table);
kfree(sbi->fragment_index);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 885f5395fcd0..567c547cf371 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -692,7 +692,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
*/
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
-void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(void);
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 31f114f486c4..7af9949828ff 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -168,12 +168,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page,
}
#endif /* CONFIG_DMA_CMA*/
-#ifdef CONFIG_DMA_PERNUMA_CMA
-void dma_pernuma_cma_reserve(void);
-#else
-static inline void dma_pernuma_cma_reserve(void) { }
-#endif /* CONFIG_DMA_PERNUMA_CMA */
-
#ifdef CONFIG_DMA_DECLARE_COHERENT
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_addr_t device_addr, size_t size);
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 481abf530b3c..6d5edef09d45 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -93,6 +93,15 @@ struct kmem_cache;
bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order);
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order);
+#else
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
int should_failslab(struct kmem_cache *s, gfp_t gfpflags);
#ifdef CONFIG_FAILSLAB
extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ed8cb537c6a7..665f06675c83 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -338,19 +338,12 @@ extern gfp_t gfp_allowed_mask;
/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
-extern void pm_restrict_gfp_mask(void);
-extern void pm_restore_gfp_mask(void);
-
-extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
-
-#ifdef CONFIG_PM_SLEEP
-extern bool pm_suspended_storage(void);
-#else
-static inline bool pm_suspended_storage(void)
+static inline bool gfp_has_io_fs(gfp_t gfp)
{
- return false;
+ return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS);
}
-#endif /* CONFIG_PM_SLEEP */
+
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6d041aa9f0fe..21f942025fec 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -133,9 +133,8 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
- struct page **, struct vm_area_struct **,
- unsigned long *, unsigned long *, long, unsigned int,
- int *);
+ struct page **, unsigned long *, unsigned long *,
+ long, unsigned int, int *);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
@@ -306,9 +305,8 @@ static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
static inline long follow_hugetlb_page(struct mm_struct *mm,
struct vm_area_struct *vma, struct page **pages,
- struct vm_area_struct **vmas, unsigned long *position,
- unsigned long *nr_pages, long i, unsigned int flags,
- int *nonblocking)
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags, int *nonblocking)
{
BUG();
return 0;
@@ -757,14 +755,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
return folio->_hugetlb_subpool;
}
-/*
- * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool
- */
-static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
-{
- return hugetlb_folio_subpool(page_folio(hpage));
-}
-
static inline void hugetlb_set_folio_subpool(struct folio *folio,
struct hugepage_subpool *subpool)
{
@@ -1031,11 +1021,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
return NULL;
}
-static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
-{
- return NULL;
-}
-
static inline int isolate_or_dissolve_huge_page(struct page *page,
struct list_head *list)
{
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index f7ef70661ce2..819b6bc8ac08 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -343,7 +343,7 @@ static inline void *kasan_reset_tag(const void *addr)
* @is_write: whether the bad access is a write or a read
* @ip: instruction pointer for the accessibility check or the bad access itself
*/
-bool kasan_report(unsigned long addr, size_t size,
+bool kasan_report(const void *addr, size_t size,
bool is_write, unsigned long ip);
#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 1fadb5f5978b..85559a34a098 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -455,7 +455,9 @@ void *mas_erase(struct ma_state *mas);
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
void mas_store_prealloc(struct ma_state *mas, void *entry);
void *mas_find(struct ma_state *mas, unsigned long max);
+void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
+void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
int mas_preallocate(struct ma_state *mas, gfp_t gfp);
bool mas_is_err(struct ma_state *mas);
@@ -466,7 +468,9 @@ void mas_destroy(struct ma_state *mas);
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);
void *mas_prev(struct ma_state *mas, unsigned long min);
+void *mas_prev_range(struct ma_state *mas, unsigned long max);
void *mas_next(struct ma_state *mas, unsigned long max);
+void *mas_next_range(struct ma_state *mas, unsigned long max);
int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
unsigned long size);
@@ -482,13 +486,13 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
}
/* Checks if a mas has not found anything */
-static inline bool mas_is_none(struct ma_state *mas)
+static inline bool mas_is_none(const struct ma_state *mas)
{
return mas->node == MAS_NONE;
}
/* Checks if a mas has been paused */
-static inline bool mas_is_paused(struct ma_state *mas)
+static inline bool mas_is_paused(const struct ma_state *mas)
{
return mas->node == MAS_PAUSE;
}
@@ -528,6 +532,19 @@ static inline void mas_reset(struct ma_state *mas)
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
+/**
+ * mas_contiguous() - Iterate over a contiguous range of the maple tree.
+ * @__mas: Maple Tree operation state (maple_state)
+ * @__entry: Entry retrieved from the tree
+ * @__max: maximum index to retrieve from the tree
+ *
+ * When returned, mas->index and mas->last will hold the entire range of the
+ * entry. The loop will terminate on the first NULL encountered.
+ *
+ * Note: may return the zero entry.
+ */
+#define mas_contiguous(__mas, __entry, __max) \
+ while (((__entry) = mas_find_range((__mas), (__max))) != NULL)
/**
* mas_set_range() - Set up Maple Tree operation state for a different index.
@@ -616,7 +633,7 @@ static inline void mt_clear_in_rcu(struct maple_tree *mt)
return;
if (mt_external_lock(mt)) {
- BUG_ON(!mt_lock_is_held(mt));
+ WARN_ON(!mt_lock_is_held(mt));
mt->ma_flags &= ~MT_FLAGS_USE_RCU;
} else {
mtree_lock(mt);
@@ -635,7 +652,7 @@ static inline void mt_set_in_rcu(struct maple_tree *mt)
return;
if (mt_external_lock(mt)) {
- BUG_ON(!mt_lock_is_held(mt));
+ WARN_ON(!mt_lock_is_held(mt));
mt->ma_flags |= MT_FLAGS_USE_RCU;
} else {
mtree_lock(mt);
@@ -670,10 +687,17 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
#ifdef CONFIG_DEBUG_MAPLE_TREE
+enum mt_dump_format {
+ mt_dump_dec,
+ mt_dump_hex,
+};
+
extern atomic_t maple_tree_tests_run;
extern atomic_t maple_tree_tests_passed;
-void mt_dump(const struct maple_tree *mt);
+void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
+void mas_dump(const struct ma_state *mas);
+void mas_wr_dump(const struct ma_wr_state *wr_mas);
void mt_validate(struct maple_tree *mt);
void mt_cache_shrink(void);
#define MT_BUG_ON(__tree, __x) do { \
@@ -681,7 +705,40 @@ void mt_cache_shrink(void);
if (__x) { \
pr_info("BUG at %s:%d (%u)\n", \
__func__, __LINE__, __x); \
- mt_dump(__tree); \
+ mt_dump(__tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+
+#define MAS_BUG_ON(__mas, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_dump(__mas); \
+ mt_dump((__mas)->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+
+#define MAS_WR_BUG_ON(__wrmas, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_wr_dump(__wrmas); \
+ mas_dump((__wrmas)->mas); \
+ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \
pr_info("Pass: %u Run:%u\n", \
atomic_read(&maple_tree_tests_passed), \
atomic_read(&maple_tree_tests_run)); \
@@ -690,8 +747,67 @@ void mt_cache_shrink(void);
atomic_inc(&maple_tree_tests_passed); \
} \
} while (0)
+
+#define MT_WARN_ON(__tree, __x) ({ \
+ int ret = !!(__x); \
+ atomic_inc(&maple_tree_tests_run); \
+ if (ret) { \
+ pr_info("WARN at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mt_dump(__tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+ unlikely(ret); \
+})
+
+#define MAS_WARN_ON(__mas, __x) ({ \
+ int ret = !!(__x); \
+ atomic_inc(&maple_tree_tests_run); \
+ if (ret) { \
+ pr_info("WARN at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_dump(__mas); \
+ mt_dump((__mas)->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+ unlikely(ret); \
+})
+
+#define MAS_WR_WARN_ON(__wrmas, __x) ({ \
+ int ret = !!(__x); \
+ atomic_inc(&maple_tree_tests_run); \
+ if (ret) { \
+ pr_info("WARN at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_wr_dump(__wrmas); \
+ mas_dump((__wrmas)->mas); \
+ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+ unlikely(ret); \
+})
#else
-#define MT_BUG_ON(__tree, __x) BUG_ON(__x)
+#define MT_BUG_ON(__tree, __x) BUG_ON(__x)
+#define MAS_BUG_ON(__mas, __x) BUG_ON(__x)
+#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x)
+#define MT_WARN_ON(__tree, __x) WARN_ON(__x)
+#define MAS_WARN_ON(__mas, __x) WARN_ON(__x)
+#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x)
#endif /* CONFIG_DEBUG_MAPLE_TREE */
#endif /*_LINUX_MAPLE_TREE_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 222d7370134c..00a88cf947e1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1038,7 +1038,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
}
void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_atomic(void);
void mem_cgroup_flush_stats_ratelimited(void);
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -1537,10 +1536,6 @@ static inline void mem_cgroup_flush_stats(void)
{
}
-static inline void mem_cgroup_flush_stats_atomic(void)
-{
-}
-
static inline void mem_cgroup_flush_stats_ratelimited(void)
{
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 9fcbf5706595..04bc286eed42 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -326,9 +326,6 @@ static inline int remove_memory(u64 start, u64 size)
static inline void __remove_memory(u64 start, u64 size) {}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
#ifdef CONFIG_MEMORY_HOTPLUG
extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat);
extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27ce77080c79..66032f0d515c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -866,11 +866,24 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
return mas_find(&vmi->mas, ULONG_MAX);
}
+static inline
+struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
+{
+ return mas_next_range(&vmi->mas, ULONG_MAX);
+}
+
+
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
{
return mas_prev(&vmi->mas, 0);
}
+static inline
+struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
+{
+ return mas_prev_range(&vmi->mas, 0);
+}
+
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
return vmi->mas.index;
@@ -2353,6 +2366,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
unmap_mapping_range(mapping, holebegin, holelen, 0);
}
+static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
+ unsigned long addr);
+
extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
@@ -2361,19 +2377,42 @@ extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
long get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked);
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked);
+ int *locked);
+
+static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
+ unsigned long addr,
+ int gup_flags,
+ struct vm_area_struct **vmap)
+{
+ struct page *page;
+ struct vm_area_struct *vma;
+ int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+
+ if (got < 0)
+ return ERR_PTR(got);
+ if (got == 0)
+ return NULL;
+
+ vma = vma_lookup(mm, addr);
+ if (WARN_ON_ONCE(!vma)) {
+ put_page(page);
+ return ERR_PTR(-EINVAL);
+ }
+
+ *vmap = vma;
+ return page;
+}
+
long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas);
+ unsigned int gup_flags, struct page **pages);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas);
+ unsigned int gup_flags, struct page **pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
@@ -2422,6 +2461,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
@@ -2994,12 +3034,6 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
extern void set_dma_reserve(unsigned long new_dma_reserve);
-extern void memmap_init_range(unsigned long, int, unsigned long,
- unsigned long, unsigned long, enum meminit_context,
- struct vmem_altmap *, int migratetype);
-extern void setup_per_zone_wmarks(void);
-extern void calculate_min_free_kbytes(void);
-extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
@@ -3020,11 +3054,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
extern void setup_per_cpu_pageset(void);
-/* page_alloc.c */
-extern int min_free_kbytes;
-extern int watermark_boost_factor;
-extern int watermark_scale_factor;
-
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
@@ -3471,9 +3500,58 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 0);
}
+
+extern unsigned int _debug_guardpage_minorder;
+DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+static inline unsigned int debug_guardpage_minorder(void)
+{
+ return _debug_guardpage_minorder;
+}
+
+static inline bool debug_guardpage_enabled(void)
+{
+ return static_branch_unlikely(&_debug_guardpage_enabled);
+}
+
+static inline bool page_is_guard(struct page *page)
+{
+ if (!debug_guardpage_enabled())
+ return false;
+
+ return PageGuard(page);
+}
+
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype);
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
+{
+ if (!debug_guardpage_enabled())
+ return false;
+ return __set_page_guard(zone, page, order, migratetype);
+}
+
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype);
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
+{
+ if (!debug_guardpage_enabled())
+ return;
+ __clear_page_guard(zone, page, order, migratetype);
+}
+
#else /* CONFIG_DEBUG_PAGEALLOC */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
+static inline unsigned int debug_guardpage_minorder(void) { return 0; }
+static inline bool debug_guardpage_enabled(void) { return false; }
+static inline bool page_is_guard(struct page *page) { return false; }
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) { return false; }
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
@@ -3586,6 +3664,10 @@ extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Sysfs entries for memory failure handling statistics.
+ */
+extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
@@ -3678,11 +3760,6 @@ enum mf_action_page_type {
MF_MSG_UNKNOWN,
};
-/*
- * Sysfs entries for memory failure handling statistics.
- */
-extern const struct attribute_group memory_failure_attr_group;
-
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
@@ -3712,33 +3789,6 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-extern unsigned int _debug_guardpage_minorder;
-DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
-
-static inline unsigned int debug_guardpage_minorder(void)
-{
- return _debug_guardpage_minorder;
-}
-
-static inline bool debug_guardpage_enabled(void)
-{
- return static_branch_unlikely(&_debug_guardpage_enabled);
-}
-
-static inline bool page_is_guard(struct page *page)
-{
- if (!debug_guardpage_enabled())
- return false;
-
- return PageGuard(page);
-}
-#else
-static inline unsigned int debug_guardpage_minorder(void) { return 0; }
-static inline bool debug_guardpage_enabled(void) { return false; }
-static inline bool page_is_guard(struct page *page) { return false; }
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index b8728d11c949..7c3e7b0b0e8f 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -8,10 +8,12 @@
struct page;
struct vm_area_struct;
struct mm_struct;
+struct vma_iterator;
void dump_page(struct page *page, const char *reason);
void dump_vma(const struct vm_area_struct *vma);
void dump_mm(const struct mm_struct *mm);
+void vma_iter_dump_tree(const struct vma_iterator *vmi);
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) BUG_ON(cond)
@@ -74,6 +76,17 @@ void dump_mm(const struct mm_struct *mm);
} \
unlikely(__ret_warn_once); \
})
+#define VM_WARN_ON_ONCE_MM(cond, mm) ({ \
+ static bool __section(".data.once") __warned; \
+ int __ret_warn_once = !!(cond); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ dump_mm(mm); \
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+})
#define VM_WARN_ON(cond) (void)WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
@@ -90,6 +103,7 @@ void dump_mm(const struct mm_struct *mm);
#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4889c9d4055..3a68326c9989 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1512,27 +1512,6 @@ static inline bool has_managed_dma(void)
}
#endif
-/* These two functions are used to setup the per zone pages min values */
-struct ctl_table;
-
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
- loff_t *);
-int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
- size_t *, loff_t *);
-extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
- size_t *, loff_t *);
-int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
- void *, size_t *, loff_t *);
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
- void *, size_t *, loff_t *);
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
- void *, size_t *, loff_t *);
-int numa_zonelist_order_handler(struct ctl_table *, int,
- void *, size_t *, loff_t *);
-extern int percpu_pagelist_high_fraction;
-extern char numa_zonelist_order[];
-#define NUMA_ZONELIST_ORDER_LEN 16
#ifndef CONFIG_NUMA
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..c1ae5ebc375f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1078,8 +1078,6 @@ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst,
#else
#define filemap_migrate_folio NULL
#endif
-void page_endio(struct page *page, bool is_write, int err);
-
void folio_end_private_2(struct folio *folio);
void folio_wait_private_2(struct folio *folio);
int folio_wait_private_2_killable(struct folio *folio);
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index e58306783d8e..8bbd1639ae85 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -94,6 +94,8 @@ static inline int stack_depot_early_init(void) { return 0; }
depot_stack_handle_t __stack_depot_save(unsigned long *entries,
unsigned int nr_entries,
gfp_t gfp_flags, bool can_alloc);
+void stack_depot_inc_count(depot_stack_handle_t handle);
+void stack_depot_dec_count(depot_stack_handle_t handle);
/**
* stack_depot_save - Save a stack trace to stack depot
@@ -110,6 +112,13 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int nr_entries, gfp_t gfp_flags);
+#ifdef CONFIG_PAGE_OWNER
+struct seq_file;
+void *stack_start(struct seq_file *m, loff_t *ppos);
+void *stack_next(struct seq_file *m, void *v, loff_t *ppos);
+int stack_print(struct seq_file *m, void *v);
+#endif
+
/**
* stack_depot_fetch - Fetch a stack trace from stack depot
*
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d0d4598a7b3f..76923051c03d 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -364,9 +364,6 @@ struct pbe {
struct pbe *next;
};
-/* mm/page_alloc.c */
-extern void mark_free_pages(struct zone *zone);
-
/**
* struct platform_hibernation_ops - hibernation platform support
*
@@ -505,6 +502,11 @@ extern void pm_report_max_hw_sleep(u64 t);
extern bool events_check_enabled;
extern suspend_state_t pm_suspend_target_state;
+static inline bool pm_suspended_storage(void)
+{
+ return !gfp_has_io_fs(gfp_allowed_mask);
+}
+
extern bool pm_wakeup_pending(void);
extern void pm_system_wakeup(void);
extern void pm_system_cancel_wakeup(void);
@@ -538,6 +540,7 @@ static inline void ksys_sync_helper(void) {}
#define pm_notifier(fn, pri) do { (void)(fn); } while (0)
+static inline bool pm_suspended_storage(void) { return false; }
static inline bool pm_wakeup_pending(void) { return false; }
static inline void pm_system_wakeup(void) {}
static inline void pm_wakeup_clear(bool reset) {}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3c69cb653cb9..b2128df5edea 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -368,6 +368,7 @@ static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry)
}
/* linux/mm/workingset.c */
+bool workingset_test_recent(void *shadow, bool file, bool *workingset);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33a0ee3bcb2e..6648c07c4381 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -72,6 +72,8 @@ struct open_how;
struct mount_attr;
struct landlock_ruleset_attr;
enum landlock_rule_type;
+struct cachestat_range;
+struct cachestat;
#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -1058,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
+asmlinkage long sys_cachestat(unsigned int fd,
+ struct cachestat_range __user *cstat_range,
+ struct cachestat __user *cstat, unsigned int flags);
/*
* Architecture-specific system calls
diff --git a/include/linux/types.h b/include/linux/types.h
index 688fb943556a..7e5a1fb7cfa1 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -35,6 +35,7 @@ typedef __kernel_uid16_t uid16_t;
typedef __kernel_gid16_t gid16_t;
typedef unsigned long uintptr_t;
+typedef long intptr_t;
#ifdef CONFIG_HAVE_UID16
/* This is defined by include/asm-{arch}/posix_types.h */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..cd639fae9086 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
#define __NR_set_mempolicy_home_node 450
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+#define __NR_cachestat 451
+__SYSCALL(__NR_cachestat, sys_cachestat)
+
#undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0..a246e11988d5 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -4,6 +4,7 @@
#include <asm/mman.h>
#include <asm-generic/hugetlb_encode.h>
+#include <linux/types.h>
#define MREMAP_MAYMOVE 1
#define MREMAP_FIXED 2
@@ -41,4 +42,17 @@
#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
+struct cachestat_range {
+ __u64 off;
+ __u64 len;
+};
+
+struct cachestat {
+ __u64 nr_cache;
+ __u64 nr_dirty;
+ __u64 nr_writeback;
+ __u64 nr_evicted;
+ __u64 nr_recently_evicted;
+};
+
#endif /* _UAPI_LINUX_MMAN_H */
diff --git a/init/Kconfig b/init/Kconfig
index 32c24950c4ce..f7f65af4ee12 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1771,6 +1771,16 @@ config RSEQ
If unsure, say Y.
+config CACHESTAT_SYSCALL
+ bool "Enable cachestat() system call" if EXPERT
+ default y
+ help
+ Enable the cachestat system call, which queries the page cache
+ statistics of a file (number of cached pages, dirty pages,
+ pages marked for writeback, (recently) evicted pages).
+
+ If unsure say Y here.
+
config DEBUG_RSEQ
default n
bool "Enabled debugging of rseq() system call" if EXPERT
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d46f72a5ef73..b56bda46a9eb 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1030,9 +1030,8 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
unsigned long start, end, nr_pages;
- struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
- int i, pret, ret = -ENOMEM;
+ int pret, ret = -ENOMEM;
end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
@@ -1042,45 +1041,24 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
if (!pages)
goto done;
- vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- goto done;
-
ret = 0;
mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages, vmas);
- if (pret == nr_pages) {
- /* don't support file backed memory */
- for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma_is_shmem(vma))
- continue;
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
- ret = -EOPNOTSUPP;
- break;
- }
- }
+ pages);
+ if (pret == nr_pages)
*npages = nr_pages;
- } else {
+ else
ret = pret < 0 ? pret : -EFAULT;
- }
+
mmap_read_unlock(current->mm);
if (ret) {
- /*
- * if we did partial map, or found file backed vmas,
- * release any pages we did get
- */
+ /* if we did partial map, release any pages we did get */
if (pret > 0)
unpin_user_pages(pages, pret);
goto done;
}
ret = 0;
done:
- kvfree(vmas);
if (ret < 0) {
kvfree(pages);
pages = ERR_PTR(ret);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9c4c55228567..2542c21b6b6d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__diag_pop();
/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
int cpu;
@@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
- /* if @may_sleep, play nice and yield if necessary */
- if (may_sleep && (need_resched() ||
- spin_needbreak(&cgroup_rstat_lock))) {
+ /* play nice and yield if necessary */
+ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
spin_unlock_irq(&cgroup_rstat_lock);
if (!cond_resched())
cpu_relax();
@@ -236,26 +235,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
spin_unlock_irq(&cgroup_rstat_lock);
}
/**
- * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_atomic(struct cgroup *cgrp)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cgroup_rstat_lock, flags);
- cgroup_rstat_flush_locked(cgrp, false);
- spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
-}
-
-/**
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
@@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
}
/**
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 6677d0e64d27..79f83091e3a2 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -140,10 +140,10 @@ if DMA_CMA
config DMA_PERNUMA_CMA
bool "Enable separate DMA Contiguous Memory Area for each NUMA Node"
- default NUMA && ARM64
+ default NUMA
help
- Enable this option to get pernuma CMA areas so that devices like
- ARM64 SMMU can get local memory by DMA coherent APIs.
+ Enable this option to get pernuma CMA areas so that NUMA devices
+ can get local memory by DMA coherent APIs.
You can set the size of pernuma CMA by specifying "cma_pernuma=size"
on the kernel's command line.
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 6ea80ae42622..26a8e5365fcd 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -128,7 +128,7 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif
#ifdef CONFIG_DMA_PERNUMA_CMA
-void __init dma_pernuma_cma_reserve(void)
+static void __init dma_pernuma_cma_reserve(void)
{
int nid;
@@ -153,6 +153,10 @@ void __init dma_pernuma_cma_reserve(void)
(unsigned long long)pernuma_size_bytes / SZ_1M, nid);
}
}
+#else
+static inline void __init dma_pernuma_cma_reserve(void)
+{
+}
#endif
/**
@@ -171,6 +175,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
phys_addr_t selected_limit = limit;
bool fixed = false;
+ dma_pernuma_cma_reserve();
+
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
if (size_cmdline != -1) {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 59887c69d54c..cac3aef7c6f7 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
void *kaddr;
struct page *page;
- struct vm_area_struct *vma;
int ret;
short *ptr;
@@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
return -EINVAL;
ret = get_user_pages_remote(mm, vaddr, 1,
- FOLL_WRITE, &page, &vma, NULL);
+ FOLL_WRITE, &page, NULL);
if (unlikely(ret <= 0)) {
/*
* We are asking for 1 page. If get_user_pages_remote() fails,
@@ -474,10 +473,9 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
- &old_page, &vma, NULL);
- if (ret <= 0)
- return ret;
+ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR_OR_NULL(old_page))
+ return PTR_ERR(old_page);
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
@@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
- NULL, NULL);
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
if (result < 0)
return result;
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..d17995934eb4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -252,23 +252,19 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
int i;
int ret;
+ int nr_charged = 0;
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
if (ret)
goto err;
+ nr_charged++;
}
return 0;
err:
- /*
- * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
- * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
- * ignore this page.
- */
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ for (i = 0; i < nr_charged; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
return ret;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490792b1066e..08455fb48ba6 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -312,10 +312,10 @@ void __noreturn kthread_exit(long result)
* @comp: Completion to complete
* @code: The integer value to return to kthread_stop().
*
- * If present complete @comp and the reuturn code to kthread_stop().
+ * If present, complete @comp and then return code to kthread_stop().
*
* A kernel thread whose module may be removed after the completion of
- * @comp can use this function exit safely.
+ * @comp can use this function to exit safely.
*
* Does not return.
*/
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3113ec2f1db4..34fc8359145b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,33 @@
#include "power.h"
#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
+}
+
+void pm_restrict_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+}
unsigned int lock_system_sleep(void)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b83c8d5e188d..ac14d1b463d1 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -216,6 +216,11 @@ static inline void suspend_test_finish(const char *label) {}
/* kernel/power/main.c */
extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
+void pm_restrict_gfp_mask(void);
+void pm_restore_gfp_mask(void);
+#else
+static inline void pm_restrict_gfp_mask(void) {}
+static inline void pm_restore_gfp_mask(void) {}
#endif
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cd8b7b35f1e8..45ef0bf81c85 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1228,6 +1228,58 @@ unsigned int snapshot_additional_pages(struct zone *zone)
return 2 * rtree;
}
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
+static void mark_free_pages(struct zone *zone)
+{
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
+ unsigned long flags;
+ unsigned int order, t;
+ struct page *page;
+
+ if (zone_is_empty(zone))
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], buddy_list) {
+ unsigned long i;
+
+ pfn = page_to_pfn(page);
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+ swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
#ifdef CONFIG_HIGHMEM
/**
* count_free_highmem_pages - Compute the total number of free highmem pages.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..04bfb1e4d377 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy);
COND_SYSCALL(migrate_pages);
COND_SYSCALL(move_pages);
COND_SYSCALL(set_mempolicy_home_node);
+COND_SYSCALL(cachestat);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bfe53e835524..a57de67f032f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2120,13 +2120,6 @@ static struct ctl_table vm_table[] = {
},
#endif
{
- .procname = "lowmem_reserve_ratio",
- .data = &sysctl_lowmem_reserve_ratio,
- .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
- .mode = 0644,
- .proc_handler = lowmem_reserve_ratio_sysctl_handler,
- },
- {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
@@ -2136,39 +2129,6 @@ static struct ctl_table vm_table[] = {
.extra2 = SYSCTL_FOUR,
},
{
- .procname = "min_free_kbytes",
- .data = &min_free_kbytes,
- .maxlen = sizeof(min_free_kbytes),
- .mode = 0644,
- .proc_handler = min_free_kbytes_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "watermark_boost_factor",
- .data = &watermark_boost_factor,
- .maxlen = sizeof(watermark_boost_factor),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "watermark_scale_factor",
- .data = &watermark_scale_factor,
- .maxlen = sizeof(watermark_scale_factor),
- .mode = 0644,
- .proc_handler = watermark_scale_factor_sysctl_handler,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_THREE_THOUSAND,
- },
- {
- .procname = "percpu_pagelist_high_fraction",
- .data = &percpu_pagelist_high_fraction,
- .maxlen = sizeof(percpu_pagelist_high_fraction),
- .mode = 0644,
- .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "page_lock_unfairness",
.data = &sysctl_page_lock_unfairness,
.maxlen = sizeof(sysctl_page_lock_unfairness),
@@ -2223,24 +2183,6 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
- {
- .procname = "min_unmapped_ratio",
- .data = &sysctl_min_unmapped_ratio,
- .maxlen = sizeof(sysctl_min_unmapped_ratio),
- .mode = 0644,
- .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "min_slab_ratio",
- .data = &sysctl_min_slab_ratio,
- .maxlen = sizeof(sysctl_min_slab_ratio),
- .mode = 0644,
- .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
#endif
#ifdef CONFIG_SMP
{
@@ -2267,15 +2209,6 @@ static struct ctl_table vm_table[] = {
.proc_handler = mmap_min_addr_handler,
},
#endif
-#ifdef CONFIG_NUMA
- {
- .procname = "numa_zonelist_order",
- .data = &numa_zonelist_order,
- .maxlen = NUMA_ZONELIST_ORDER_LEN,
- .mode = 0644,
- .proc_handler = numa_zonelist_order_handler,
- },
-#endif
#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b1ecd7677642..bdc2666e8d39 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -406,7 +406,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
return -EBUSY;
ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
- &page, NULL, NULL);
+ &page, NULL);
if (unlikely(ret <= 0)) {
if (!fixup_fault)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ce51d4dc6803..f202648dead9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2302,9 +2302,13 @@ config TEST_XARRAY
tristate "Test the XArray code at runtime"
config TEST_MAPLE_TREE
- depends on DEBUG_KERNEL
- select DEBUG_MAPLE_TREE
- tristate "Test the Maple Tree code at runtime"
+ tristate "Test the Maple Tree code at runtime or module load"
+ help
+ Enable this option to test the maple tree code functions at boot, or
+ when the module is loaded. Enable "Debug Maple Trees" will enable
+ more verbose output on failures.
+
+ If unsure, say N.
config TEST_RHASHTABLE
tristate "Perform selftest on resizable hash table"
diff --git a/lib/Makefile b/lib/Makefile
index 876fcdeae34e..38f23f352736 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,7 +30,7 @@ endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
maple_tree.o idr.o extable.o irq_regs.o argv_split.o \
- flex_proportions.o ratelimit.o show_mem.o \
+ flex_proportions.o ratelimit.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
nmi_backtrace.o win_minmax.o memcat_p.o \
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 8ebc43d4cc8c..4eb220008f72 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -194,7 +194,7 @@ static void mas_set_height(struct ma_state *mas)
unsigned int new_flags = mas->tree->ma_flags;
new_flags &= ~MT_FLAGS_HEIGHT_MASK;
- BUG_ON(mas->depth > MAPLE_HEIGHT_MAX);
+ MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX);
new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET;
mas->tree->ma_flags = new_flags;
}
@@ -240,12 +240,12 @@ static inline void mas_set_err(struct ma_state *mas, long err)
mas->node = MA_ERROR(err);
}
-static inline bool mas_is_ptr(struct ma_state *mas)
+static inline bool mas_is_ptr(const struct ma_state *mas)
{
return mas->node == MAS_ROOT;
}
-static inline bool mas_is_start(struct ma_state *mas)
+static inline bool mas_is_start(const struct ma_state *mas)
{
return mas->node == MAS_START;
}
@@ -425,28 +425,26 @@ static inline unsigned long mte_parent_slot_mask(unsigned long parent)
}
/*
- * mas_parent_enum() - Return the maple_type of the parent from the stored
+ * mas_parent_type() - Return the maple_type of the parent from the stored
* parent type.
* @mas: The maple state
- * @node: The maple_enode to extract the parent's enum
+ * @enode: The maple_enode to extract the parent's enum
* Return: The node->parent maple_type
*/
static inline
-enum maple_type mte_parent_enum(struct maple_enode *p_enode,
- struct maple_tree *mt)
+enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode)
{
unsigned long p_type;
- p_type = (unsigned long)p_enode;
- if (p_type & MAPLE_PARENT_ROOT)
- return 0; /* Validated in the caller. */
+ p_type = (unsigned long)mte_to_node(enode)->parent;
+ if (WARN_ON(p_type & MAPLE_PARENT_ROOT))
+ return 0;
p_type &= MAPLE_NODE_MASK;
- p_type = p_type & ~(MAPLE_PARENT_ROOT | mte_parent_slot_mask(p_type));
-
+ p_type &= ~mte_parent_slot_mask(p_type);
switch (p_type) {
case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */
- if (mt_is_alloc(mt))
+ if (mt_is_alloc(mas->tree))
return maple_arange_64;
return maple_range_64;
}
@@ -454,14 +452,8 @@ enum maple_type mte_parent_enum(struct maple_enode *p_enode,
return 0;
}
-static inline
-enum maple_type mas_parent_enum(struct ma_state *mas, struct maple_enode *enode)
-{
- return mte_parent_enum(ma_enode_ptr(mte_to_node(enode)->parent), mas->tree);
-}
-
/*
- * mte_set_parent() - Set the parent node and encode the slot
+ * mas_set_parent() - Set the parent node and encode the slot
* @enode: The encoded maple node.
* @parent: The encoded maple node that is the parent of @enode.
* @slot: The slot that @enode resides in @parent.
@@ -470,16 +462,16 @@ enum maple_type mas_parent_enum(struct ma_state *mas, struct maple_enode *enode)
* parent type.
*/
static inline
-void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent,
- unsigned char slot)
+void mas_set_parent(struct ma_state *mas, struct maple_enode *enode,
+ const struct maple_enode *parent, unsigned char slot)
{
unsigned long val = (unsigned long)parent;
unsigned long shift;
unsigned long type;
enum maple_type p_type = mte_node_type(parent);
- BUG_ON(p_type == maple_dense);
- BUG_ON(p_type == maple_leaf_64);
+ MAS_BUG_ON(mas, p_type == maple_dense);
+ MAS_BUG_ON(mas, p_type == maple_leaf_64);
switch (p_type) {
case maple_range_64:
@@ -671,22 +663,22 @@ static inline unsigned long *ma_gaps(struct maple_node *node,
}
/*
- * mte_pivot() - Get the pivot at @piv of the maple encoded node.
- * @mn: The maple encoded node.
+ * mas_pivot() - Get the pivot at @piv of the maple encoded node.
+ * @mas: The maple state.
* @piv: The pivot.
*
* Return: the pivot at @piv of @mn.
*/
-static inline unsigned long mte_pivot(const struct maple_enode *mn,
- unsigned char piv)
+static inline unsigned long mas_pivot(struct ma_state *mas, unsigned char piv)
{
- struct maple_node *node = mte_to_node(mn);
- enum maple_type type = mte_node_type(mn);
+ struct maple_node *node = mas_mn(mas);
+ enum maple_type type = mte_node_type(mas->node);
- if (piv >= mt_pivots[type]) {
- WARN_ON(1);
+ if (MAS_WARN_ON(mas, piv >= mt_pivots[type])) {
+ mas_set_err(mas, -EIO);
return 0;
}
+
switch (type) {
case maple_arange_64:
return node->ma64.pivot[piv];
@@ -971,8 +963,6 @@ static inline unsigned char ma_meta_end(struct maple_node *mn,
static inline unsigned char ma_meta_gap(struct maple_node *mn,
enum maple_type mt)
{
- BUG_ON(mt != maple_arange_64);
-
return mn->ma64.meta.gap;
}
@@ -1111,7 +1101,6 @@ static int mas_ascend(struct ma_state *mas)
enum maple_type a_type;
unsigned long min, max;
unsigned long *pivots;
- unsigned char offset;
bool set_max = false, set_min = false;
a_node = mas_mn(mas);
@@ -1123,8 +1112,9 @@ static int mas_ascend(struct ma_state *mas)
p_node = mte_parent(mas->node);
if (unlikely(a_node == p_node))
return 1;
- a_type = mas_parent_enum(mas, mas->node);
- offset = mte_parent_slot(mas->node);
+
+ a_type = mas_parent_type(mas, mas->node);
+ mas->offset = mte_parent_slot(mas->node);
a_enode = mt_mk_node(p_node, a_type);
/* Check to make sure all parent information is still accurate */
@@ -1132,7 +1122,6 @@ static int mas_ascend(struct ma_state *mas)
return 1;
mas->node = a_enode;
- mas->offset = offset;
if (mte_is_root(a_enode)) {
mas->max = ULONG_MAX;
@@ -1140,11 +1129,17 @@ static int mas_ascend(struct ma_state *mas)
return 0;
}
+ if (!mas->min)
+ set_min = true;
+
+ if (mas->max == ULONG_MAX)
+ set_max = true;
+
min = 0;
max = ULONG_MAX;
do {
p_enode = a_enode;
- a_type = mas_parent_enum(mas, p_enode);
+ a_type = mas_parent_type(mas, p_enode);
a_node = mte_parent(p_enode);
a_slot = mte_parent_slot(p_enode);
a_enode = mt_mk_node(a_node, a_type);
@@ -1401,9 +1396,9 @@ static inline struct maple_enode *mas_start(struct ma_state *mas)
mas->min = 0;
mas->max = ULONG_MAX;
- mas->depth = 0;
retry:
+ mas->depth = 0;
root = mas_root(mas);
/* Tree with nodes */
if (likely(xa_is_node(root))) {
@@ -1631,6 +1626,7 @@ static inline unsigned long mas_max_gap(struct ma_state *mas)
return mas_leaf_max_gap(mas);
node = mas_mn(mas);
+ MAS_BUG_ON(mas, mt != maple_arange_64);
offset = ma_meta_gap(node, mt);
if (offset == MAPLE_ARANGE64_META_MAX)
return 0;
@@ -1659,11 +1655,12 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset,
enum maple_type pmt;
pnode = mte_parent(mas->node);
- pmt = mas_parent_enum(mas, mas->node);
+ pmt = mas_parent_type(mas, mas->node);
penode = mt_mk_node(pnode, pmt);
pgaps = ma_gaps(pnode, pmt);
ascend:
+ MAS_BUG_ON(mas, pmt != maple_arange_64);
meta_offset = ma_meta_gap(pnode, pmt);
if (meta_offset == MAPLE_ARANGE64_META_MAX)
meta_gap = 0;
@@ -1691,7 +1688,7 @@ ascend:
/* Go to the parent node. */
pnode = mte_parent(penode);
- pmt = mas_parent_enum(mas, penode);
+ pmt = mas_parent_type(mas, penode);
pgaps = ma_gaps(pnode, pmt);
offset = mte_parent_slot(penode);
penode = mt_mk_node(pnode, pmt);
@@ -1718,7 +1715,7 @@ static inline void mas_update_gap(struct ma_state *mas)
pslot = mte_parent_slot(mas->node);
p_gap = ma_gaps(mte_parent(mas->node),
- mas_parent_enum(mas, mas->node))[pslot];
+ mas_parent_type(mas, mas->node))[pslot];
if (p_gap != max_gap)
mas_parent_gap(mas, pslot, max_gap);
@@ -1743,7 +1740,7 @@ static inline void mas_adopt_children(struct ma_state *mas,
offset = ma_data_end(node, type, pivots, mas->max);
do {
child = mas_slot_locked(mas, slots, offset);
- mte_set_parent(child, parent, offset);
+ mas_set_parent(mas, child, parent, offset);
} while (offset--);
}
@@ -1767,7 +1764,7 @@ static inline void mas_replace(struct ma_state *mas, bool advanced)
} else {
offset = mte_parent_slot(mas->node);
slots = ma_slots(mte_parent(mas->node),
- mas_parent_enum(mas, mas->node));
+ mas_parent_type(mas, mas->node));
old_enode = mas_slot_locked(mas, slots, offset);
}
@@ -1943,8 +1940,9 @@ static inline int mab_calc_split(struct ma_state *mas,
* causes one node to be deficient.
* NOTE: mt_min_slots is 1 based, b_end and split are zero.
*/
- while (((bn->pivot[split] - min) < slot_count - 1) &&
- (split < slot_count - 1) && (b_end - split > slot_min))
+ while ((split < slot_count - 1) &&
+ ((bn->pivot[split] - min) < slot_count - 1) &&
+ (b_end - split > slot_min))
split++;
}
@@ -2347,7 +2345,8 @@ static inline void mas_topiary_range(struct ma_state *mas,
void __rcu **slots;
unsigned char offset;
- MT_BUG_ON(mas->tree, mte_is_leaf(mas->node));
+ MAS_BUG_ON(mas, mte_is_leaf(mas->node));
+
slots = ma_slots(mas_mn(mas), mte_node_type(mas->node));
for (offset = start; offset <= end; offset++) {
struct maple_enode *enode = mas_slot_locked(mas, slots, offset);
@@ -2707,9 +2706,9 @@ static inline void mas_set_split_parent(struct ma_state *mas,
return;
if ((*slot) <= split)
- mte_set_parent(mas->node, left, *slot);
+ mas_set_parent(mas, mas->node, left, *slot);
else if (right)
- mte_set_parent(mas->node, right, (*slot) - split - 1);
+ mas_set_parent(mas, mas->node, right, (*slot) - split - 1);
(*slot)++;
}
@@ -3106,12 +3105,12 @@ static int mas_spanning_rebalance(struct ma_state *mas,
mte_node_type(mast->orig_l->node));
mast->orig_l->depth++;
mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true);
- mte_set_parent(left, l_mas.node, slot);
+ mas_set_parent(mas, left, l_mas.node, slot);
if (middle)
- mte_set_parent(middle, l_mas.node, ++slot);
+ mas_set_parent(mas, middle, l_mas.node, ++slot);
if (right)
- mte_set_parent(right, l_mas.node, ++slot);
+ mas_set_parent(mas, right, l_mas.node, ++slot);
if (mas_is_root_limits(mast->l)) {
new_root:
@@ -3250,7 +3249,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
l_mas.max = l_pivs[split];
mas->min = l_mas.max + 1;
eparent = mt_mk_node(mte_parent(l_mas.node),
- mas_parent_enum(&l_mas, l_mas.node));
+ mas_parent_type(&l_mas, l_mas.node));
tmp += end;
if (!in_rcu) {
unsigned char max_p = mt_pivots[mt];
@@ -3293,7 +3292,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
/* replace parent. */
offset = mte_parent_slot(mas->node);
- mt = mas_parent_enum(&l_mas, l_mas.node);
+ mt = mas_parent_type(&l_mas, l_mas.node);
parent = mas_pop_node(mas);
slots = ma_slots(parent, mt);
pivs = ma_pivots(parent, mt);
@@ -3338,8 +3337,8 @@ static inline bool mas_split_final_node(struct maple_subtree_state *mast,
* The Big_node data should just fit in a single node.
*/
ancestor = mas_new_ma_node(mas, mast->bn);
- mte_set_parent(mast->l->node, ancestor, mast->l->offset);
- mte_set_parent(mast->r->node, ancestor, mast->r->offset);
+ mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset);
+ mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset);
mte_to_node(ancestor)->parent = mas_mn(mas)->parent;
mast->l->node = ancestor;
@@ -4263,11 +4262,13 @@ done:
static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
{
- while ((wr_mas->mas->last > wr_mas->end_piv) &&
- (wr_mas->offset_end < wr_mas->node_end))
- wr_mas->end_piv = wr_mas->pivots[++wr_mas->offset_end];
+ while ((wr_mas->offset_end < wr_mas->node_end) &&
+ (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
+ wr_mas->offset_end++;
- if (wr_mas->mas->last > wr_mas->end_piv)
+ if (wr_mas->offset_end < wr_mas->node_end)
+ wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
+ else
wr_mas->end_piv = wr_mas->mas->max;
}
@@ -4424,7 +4425,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas)
}
/* At this point, we are at the leaf node that needs to be altered. */
- wr_mas->end_piv = wr_mas->r_max;
mas_wr_end_piv(wr_mas);
if (!wr_mas->entry)
@@ -4498,6 +4498,25 @@ exists:
}
+static inline void mas_rewalk(struct ma_state *mas, unsigned long index)
+{
+retry:
+ mas_set(mas, index);
+ mas_state_walk(mas);
+ if (mas_is_start(mas))
+ goto retry;
+}
+
+static inline bool mas_rewalk_if_dead(struct ma_state *mas,
+ struct maple_node *node, const unsigned long index)
+{
+ if (unlikely(ma_dead_node(node))) {
+ mas_rewalk(mas, index);
+ return true;
+ }
+ return false;
+}
+
/*
* mas_prev_node() - Find the prev non-null entry at the same level in the
* tree. The prev value will be mas->node[mas->offset] or MAS_NONE.
@@ -4513,15 +4532,19 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
int offset, level;
void __rcu **slots;
struct maple_node *node;
- struct maple_enode *enode;
unsigned long *pivots;
+ unsigned long max;
- if (mas_is_none(mas))
- return 0;
+ node = mas_mn(mas);
+ if (!mas->min)
+ goto no_entry;
+
+ max = mas->min - 1;
+ if (max < min)
+ goto no_entry;
level = 0;
do {
- node = mas_mn(mas);
if (ma_is_root(node))
goto no_entry;
@@ -4530,64 +4553,41 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
return 1;
offset = mas->offset;
level++;
+ node = mas_mn(mas);
} while (!offset);
offset--;
mt = mte_node_type(mas->node);
- node = mas_mn(mas);
- slots = ma_slots(node, mt);
- pivots = ma_pivots(node, mt);
- if (unlikely(ma_dead_node(node)))
- return 1;
-
- mas->max = pivots[offset];
- if (offset)
- mas->min = pivots[offset - 1] + 1;
- if (unlikely(ma_dead_node(node)))
- return 1;
-
- if (mas->max < min)
- goto no_entry_min;
-
while (level > 1) {
level--;
- enode = mas_slot(mas, slots, offset);
+ slots = ma_slots(node, mt);
+ mas->node = mas_slot(mas, slots, offset);
if (unlikely(ma_dead_node(node)))
return 1;
- mas->node = enode;
mt = mte_node_type(mas->node);
node = mas_mn(mas);
- slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
- offset = ma_data_end(node, mt, pivots, mas->max);
+ offset = ma_data_end(node, mt, pivots, max);
if (unlikely(ma_dead_node(node)))
return 1;
-
- if (offset)
- mas->min = pivots[offset - 1] + 1;
-
- if (offset < mt_pivots[mt])
- mas->max = pivots[offset];
-
- if (mas->max < min)
- goto no_entry;
}
+ slots = ma_slots(node, mt);
mas->node = mas_slot(mas, slots, offset);
+ pivots = ma_pivots(node, mt);
if (unlikely(ma_dead_node(node)))
return 1;
+ if (likely(offset))
+ mas->min = pivots[offset - 1] + 1;
+ mas->max = max;
mas->offset = mas_data_end(mas);
if (unlikely(mte_dead_node(mas->node)))
return 1;
return 0;
-no_entry_min:
- mas->offset = offset;
- if (offset)
- mas->min = pivots[offset - 1] + 1;
no_entry:
if (unlikely(ma_dead_node(node)))
return 1;
@@ -4597,6 +4597,76 @@ no_entry:
}
/*
+ * mas_prev_slot() - Get the entry in the previous slot
+ *
+ * @mas: The maple state
+ * @max: The minimum starting range
+ *
+ * Return: The entry in the previous slot which is possibly NULL
+ */
+static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
+{
+ void *entry;
+ void __rcu **slots;
+ unsigned long pivot;
+ enum maple_type type;
+ unsigned long *pivots;
+ struct maple_node *node;
+ unsigned long save_point = mas->index;
+
+retry:
+ node = mas_mn(mas);
+ type = mte_node_type(mas->node);
+ pivots = ma_pivots(node, type);
+ if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
+ goto retry;
+
+again:
+ if (mas->min <= min) {
+ pivot = mas_safe_min(mas, pivots, mas->offset);
+
+ if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
+ goto retry;
+
+ if (pivot <= min)
+ return NULL;
+ }
+
+ if (likely(mas->offset)) {
+ mas->offset--;
+ mas->last = mas->index - 1;
+ mas->index = mas_safe_min(mas, pivots, mas->offset);
+ } else {
+ if (mas_prev_node(mas, min)) {
+ mas_rewalk(mas, save_point);
+ goto retry;
+ }
+
+ if (mas_is_none(mas))
+ return NULL;
+
+ mas->last = mas->max;
+ node = mas_mn(mas);
+ type = mte_node_type(mas->node);
+ pivots = ma_pivots(node, type);
+ mas->index = pivots[mas->offset - 1] + 1;
+ }
+
+ slots = ma_slots(node, type);
+ entry = mas_slot(mas, slots, mas->offset);
+ if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
+ goto retry;
+
+ if (likely(entry))
+ return entry;
+
+ if (!empty)
+ goto again;
+
+ return entry;
+}
+
+/*
* mas_next_node() - Get the next node at the same level in the tree.
* @mas: The maple state
* @max: The maximum pivot value to check.
@@ -4607,11 +4677,10 @@ no_entry:
static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
unsigned long max)
{
- unsigned long min, pivot;
+ unsigned long min;
unsigned long *pivots;
struct maple_enode *enode;
int level = 0;
- unsigned char offset;
unsigned char node_end;
enum maple_type mt;
void __rcu **slots;
@@ -4619,19 +4688,16 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
if (mas->max >= max)
goto no_entry;
+ min = mas->max + 1;
level = 0;
do {
if (ma_is_root(node))
goto no_entry;
- min = mas->max + 1;
- if (min > max)
- goto no_entry;
-
+ /* Walk up. */
if (unlikely(mas_ascend(mas)))
return 1;
- offset = mas->offset;
level++;
node = mas_mn(mas);
mt = mte_node_type(mas->node);
@@ -4640,36 +4706,37 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
if (unlikely(ma_dead_node(node)))
return 1;
- } while (unlikely(offset == node_end));
+ } while (unlikely(mas->offset == node_end));
slots = ma_slots(node, mt);
- pivot = mas_safe_pivot(mas, pivots, ++offset, mt);
- while (unlikely(level > 1)) {
- /* Descend, if necessary */
- enode = mas_slot(mas, slots, offset);
- if (unlikely(ma_dead_node(node)))
- return 1;
+ mas->offset++;
+ enode = mas_slot(mas, slots, mas->offset);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
- mas->node = enode;
+ if (level > 1)
+ mas->offset = 0;
+
+ while (unlikely(level > 1)) {
level--;
+ mas->node = enode;
node = mas_mn(mas);
mt = mte_node_type(mas->node);
slots = ma_slots(node, mt);
- pivots = ma_pivots(node, mt);
+ enode = mas_slot(mas, slots, 0);
if (unlikely(ma_dead_node(node)))
return 1;
-
- offset = 0;
- pivot = pivots[0];
}
- enode = mas_slot(mas, slots, offset);
+ if (!mas->offset)
+ pivots = ma_pivots(node, mt);
+
+ mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt);
if (unlikely(ma_dead_node(node)))
return 1;
mas->node = enode;
mas->min = min;
- mas->max = pivot;
return 0;
no_entry:
@@ -4681,92 +4748,88 @@ no_entry:
}
/*
- * mas_next_nentry() - Get the next node entry
- * @mas: The maple state
- * @max: The maximum value to check
- * @*range_start: Pointer to store the start of the range.
+ * mas_next_slot() - Get the entry in the next slot
*
- * Sets @mas->offset to the offset of the next node entry, @mas->last to the
- * pivot of the entry.
+ * @mas: The maple state
+ * @max: The maximum starting range
+ * @empty: Can be empty
*
- * Return: The next entry, %NULL otherwise
+ * Return: The entry in the next slot which is possibly NULL
*/
-static inline void *mas_next_nentry(struct ma_state *mas,
- struct maple_node *node, unsigned long max, enum maple_type type)
+static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
{
- unsigned char count;
- unsigned long pivot;
- unsigned long *pivots;
void __rcu **slots;
+ unsigned long *pivots;
+ unsigned long pivot;
+ enum maple_type type;
+ struct maple_node *node;
+ unsigned char data_end;
+ unsigned long save_point = mas->last;
void *entry;
- if (mas->last == mas->max) {
- mas->index = mas->max;
- return NULL;
- }
-
- slots = ma_slots(node, type);
+retry:
+ node = mas_mn(mas);
+ type = mte_node_type(mas->node);
pivots = ma_pivots(node, type);
- count = ma_data_end(node, type, pivots, mas->max);
- if (unlikely(ma_dead_node(node)))
- return NULL;
-
- mas->index = mas_safe_min(mas, pivots, mas->offset);
- if (unlikely(ma_dead_node(node)))
- return NULL;
-
- if (mas->index > max)
- return NULL;
-
- if (mas->offset > count)
- return NULL;
+ data_end = ma_data_end(node, type, pivots, mas->max);
+ if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
+ goto retry;
- while (mas->offset < count) {
- pivot = pivots[mas->offset];
- entry = mas_slot(mas, slots, mas->offset);
- if (ma_dead_node(node))
- return NULL;
+again:
+ if (mas->max >= max) {
+ if (likely(mas->offset < data_end))
+ pivot = pivots[mas->offset];
+ else
+ return NULL; /* must be mas->max */
- if (entry)
- goto found;
+ if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
+ goto retry;
if (pivot >= max)
return NULL;
+ }
- mas->index = pivot + 1;
+ if (likely(mas->offset < data_end)) {
+ mas->index = pivots[mas->offset] + 1;
mas->offset++;
- }
+ if (likely(mas->offset < data_end))
+ mas->last = pivots[mas->offset];
+ else
+ mas->last = mas->max;
+ } else {
+ if (mas_next_node(mas, node, max)) {
+ mas_rewalk(mas, save_point);
+ goto retry;
+ }
- if (mas->index > mas->max) {
- mas->index = mas->last;
- return NULL;
+ if (mas_is_none(mas))
+ return NULL;
+
+ mas->offset = 0;
+ mas->index = mas->min;
+ node = mas_mn(mas);
+ type = mte_node_type(mas->node);
+ pivots = ma_pivots(node, type);
+ mas->last = pivots[0];
}
- pivot = mas_safe_pivot(mas, pivots, mas->offset, type);
- entry = mas_slot(mas, slots, mas->offset);
- if (ma_dead_node(node))
- return NULL;
+ slots = ma_slots(node, type);
+ entry = mt_slot(mas->tree, slots, mas->offset);
+ if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
+ goto retry;
- if (!pivot)
- return NULL;
+ if (entry)
+ return entry;
- if (!entry)
- return NULL;
+ if (!empty) {
+ if (!mas->offset)
+ data_end = 2;
+ goto again;
+ }
-found:
- mas->last = pivot;
return entry;
}
-static inline void mas_rewalk(struct ma_state *mas, unsigned long index)
-{
-retry:
- mas_set(mas, index);
- mas_state_walk(mas);
- if (mas_is_start(mas))
- goto retry;
-}
-
/*
* mas_next_entry() - Internal function to get the next entry.
* @mas: The maple state
@@ -4781,155 +4844,10 @@ retry:
*/
static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit)
{
- void *entry = NULL;
- struct maple_enode *prev_node;
- struct maple_node *node;
- unsigned char offset;
- unsigned long last;
- enum maple_type mt;
-
- if (mas->index > limit) {
- mas->index = mas->last = limit;
- mas_pause(mas);
+ if (mas->last >= limit)
return NULL;
- }
- last = mas->last;
-retry:
- offset = mas->offset;
- prev_node = mas->node;
- node = mas_mn(mas);
- mt = mte_node_type(mas->node);
- mas->offset++;
- if (unlikely(mas->offset >= mt_slots[mt])) {
- mas->offset = mt_slots[mt] - 1;
- goto next_node;
- }
- while (!mas_is_none(mas)) {
- entry = mas_next_nentry(mas, node, limit, mt);
- if (unlikely(ma_dead_node(node))) {
- mas_rewalk(mas, last);
- goto retry;
- }
-
- if (likely(entry))
- return entry;
-
- if (unlikely((mas->index > limit)))
- break;
-
-next_node:
- prev_node = mas->node;
- offset = mas->offset;
- if (unlikely(mas_next_node(mas, node, limit))) {
- mas_rewalk(mas, last);
- goto retry;
- }
- mas->offset = 0;
- node = mas_mn(mas);
- mt = mte_node_type(mas->node);
- }
-
- mas->index = mas->last = limit;
- mas->offset = offset;
- mas->node = prev_node;
- return NULL;
-}
-
-/*
- * mas_prev_nentry() - Get the previous node entry.
- * @mas: The maple state.
- * @limit: The lower limit to check for a value.
- *
- * Return: the entry, %NULL otherwise.
- */
-static inline void *mas_prev_nentry(struct ma_state *mas, unsigned long limit,
- unsigned long index)
-{
- unsigned long pivot, min;
- unsigned char offset;
- struct maple_node *mn;
- enum maple_type mt;
- unsigned long *pivots;
- void __rcu **slots;
- void *entry;
-
-retry:
- if (!mas->offset)
- return NULL;
-
- mn = mas_mn(mas);
- mt = mte_node_type(mas->node);
- offset = mas->offset - 1;
- if (offset >= mt_slots[mt])
- offset = mt_slots[mt] - 1;
-
- slots = ma_slots(mn, mt);
- pivots = ma_pivots(mn, mt);
- if (unlikely(ma_dead_node(mn))) {
- mas_rewalk(mas, index);
- goto retry;
- }
-
- if (offset == mt_pivots[mt])
- pivot = mas->max;
- else
- pivot = pivots[offset];
-
- if (unlikely(ma_dead_node(mn))) {
- mas_rewalk(mas, index);
- goto retry;
- }
-
- while (offset && ((!mas_slot(mas, slots, offset) && pivot >= limit) ||
- !pivot))
- pivot = pivots[--offset];
-
- min = mas_safe_min(mas, pivots, offset);
- entry = mas_slot(mas, slots, offset);
- if (unlikely(ma_dead_node(mn))) {
- mas_rewalk(mas, index);
- goto retry;
- }
-
- if (likely(entry)) {
- mas->offset = offset;
- mas->last = pivot;
- mas->index = min;
- }
- return entry;
-}
-
-static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min)
-{
- void *entry;
-
- if (mas->index < min) {
- mas->index = mas->last = min;
- mas->node = MAS_NONE;
- return NULL;
- }
-retry:
- while (likely(!mas_is_none(mas))) {
- entry = mas_prev_nentry(mas, min, mas->index);
- if (unlikely(mas->last < min))
- goto not_found;
-
- if (likely(entry))
- return entry;
-
- if (unlikely(mas_prev_node(mas, min))) {
- mas_rewalk(mas, mas->index);
- goto retry;
- }
-
- mas->offset++;
- }
-
- mas->offset--;
-not_found:
- mas->index = mas->last = min;
- return NULL;
+ return mas_next_slot(mas, limit, false);
}
/*
@@ -5105,24 +5023,25 @@ void *mas_walk(struct ma_state *mas)
{
void *entry;
+ if (mas_is_none(mas) || mas_is_paused(mas) || mas_is_ptr(mas))
+ mas->node = MAS_START;
retry:
entry = mas_state_walk(mas);
- if (mas_is_start(mas))
+ if (mas_is_start(mas)) {
goto retry;
-
- if (mas_is_ptr(mas)) {
+ } else if (mas_is_none(mas)) {
+ mas->index = 0;
+ mas->last = ULONG_MAX;
+ } else if (mas_is_ptr(mas)) {
if (!mas->index) {
mas->last = 0;
- } else {
- mas->index = 1;
- mas->last = ULONG_MAX;
+ return entry;
}
- return entry;
- }
- if (mas_is_none(mas)) {
- mas->index = 0;
+ mas->index = 1;
mas->last = ULONG_MAX;
+ mas->node = MAS_NONE;
+ return NULL;
}
return entry;
@@ -5289,7 +5208,10 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
unsigned long *pivots;
enum maple_type mt;
- if (min >= max)
+ if (min > max)
+ return -EINVAL;
+
+ if (size == 0 || max - min < size - 1)
return -EINVAL;
if (mas_is_start(mas))
@@ -5338,7 +5260,10 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
{
struct maple_enode *last = mas->node;
- if (min >= max)
+ if (min > max)
+ return -EINVAL;
+
+ if (size == 0 || max - min < size - 1)
return -EINVAL;
if (mas_is_start(mas)) {
@@ -5374,7 +5299,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
return -EBUSY;
/* Trim the upper limit to the max. */
- if (max <= mas->last)
+ if (max < mas->last)
mas->last = max;
mas->index = mas->last - size + 1;
@@ -5394,8 +5319,8 @@ static inline int mas_alloc(struct ma_state *mas, void *entry,
return xa_err(mas->node);
if (!mas->index)
- return mte_pivot(mas->node, 0);
- return mte_pivot(mas->node, 1);
+ return mas_pivot(mas, 0);
+ return mas_pivot(mas, 1);
}
/* Must be walking a tree. */
@@ -5412,7 +5337,10 @@ static inline int mas_alloc(struct ma_state *mas, void *entry,
*/
min = mas->min;
if (mas->offset)
- min = mte_pivot(mas->node, mas->offset - 1) + 1;
+ min = mas_pivot(mas, mas->offset - 1) + 1;
+
+ if (mas_is_err(mas))
+ return xa_err(mas->node);
if (mas->index < min)
mas->index = min;
@@ -5694,9 +5622,9 @@ void *mas_store(struct ma_state *mas, void *entry)
trace_ma_write(__func__, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
- if (mas->index > mas->last)
- pr_err("Error %lu > %lu %p\n", mas->index, mas->last, entry);
- MT_BUG_ON(mas->tree, mas->index > mas->last);
+ if (MAS_WARN_ON(mas, mas->index > mas->last))
+ pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry);
+
if (mas->index > mas->last) {
mas_set_err(mas, -EINVAL);
return NULL;
@@ -5756,7 +5684,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry)
mas_wr_store_setup(&wr_mas);
trace_ma_write(__func__, mas, 0, entry);
mas_wr_store_entry(&wr_mas);
- BUG_ON(mas_is_err(mas));
+ MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
mas_destroy(mas);
}
EXPORT_SYMBOL_GPL(mas_store_prealloc);
@@ -5808,9 +5736,7 @@ void mas_destroy(struct ma_state *mas)
if (mas->mas_flags & MA_STATE_REBALANCE) {
unsigned char end;
- if (mas_is_start(mas))
- mas_start(mas);
-
+ mas_start(mas);
mtree_range_walk(mas);
end = mas_data_end(mas) + 1;
if (end < mt_min_slot_count(mas->node) - 1)
@@ -5900,6 +5826,34 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries)
}
EXPORT_SYMBOL_GPL(mas_expected_entries);
+static inline bool mas_next_setup(struct ma_state *mas, unsigned long max,
+ void **entry)
+{
+ bool was_none = mas_is_none(mas);
+
+ if (mas_is_none(mas) || mas_is_paused(mas))
+ mas->node = MAS_START;
+
+ if (mas_is_start(mas))
+ *entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
+
+ if (mas_is_ptr(mas)) {
+ *entry = NULL;
+ if (was_none && mas->index == 0) {
+ mas->index = mas->last = 0;
+ return true;
+ }
+ mas->index = 1;
+ mas->last = ULONG_MAX;
+ mas->node = MAS_NONE;
+ return true;
+ }
+
+ if (mas_is_none(mas))
+ return true;
+ return false;
+}
+
/**
* mas_next() - Get the next entry.
* @mas: The maple state
@@ -5913,27 +5867,38 @@ EXPORT_SYMBOL_GPL(mas_expected_entries);
*/
void *mas_next(struct ma_state *mas, unsigned long max)
{
- if (mas_is_none(mas) || mas_is_paused(mas))
- mas->node = MAS_START;
+ void *entry = NULL;
- if (mas_is_start(mas))
- mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
+ if (mas_next_setup(mas, max, &entry))
+ return entry;
- if (mas_is_ptr(mas)) {
- if (!mas->index) {
- mas->index = 1;
- mas->last = ULONG_MAX;
- }
- return NULL;
- }
+ /* Retries on dead nodes handled by mas_next_slot */
+ return mas_next_slot(mas, max, false);
+}
+EXPORT_SYMBOL_GPL(mas_next);
- if (mas->last == ULONG_MAX)
- return NULL;
+/**
+ * mas_next_range() - Advance the maple state to the next range
+ * @mas: The maple state
+ * @max: The maximum index to check.
+ *
+ * Sets @mas->index and @mas->last to the range.
+ * Must hold rcu_read_lock or the write lock.
+ * Can return the zero entry.
+ *
+ * Return: The next entry or %NULL
+ */
+void *mas_next_range(struct ma_state *mas, unsigned long max)
+{
+ void *entry = NULL;
- /* Retries on dead nodes handled by mas_next_entry */
- return mas_next_entry(mas, max);
+ if (mas_next_setup(mas, max, &entry))
+ return entry;
+
+ /* Retries on dead nodes handled by mas_next_slot */
+ return mas_next_slot(mas, max, true);
}
-EXPORT_SYMBOL_GPL(mas_next);
+EXPORT_SYMBOL_GPL(mas_next_range);
/**
* mt_next() - get the next value in the maple tree
@@ -5955,6 +5920,47 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max)
}
EXPORT_SYMBOL_GPL(mt_next);
+static inline bool mas_prev_setup(struct ma_state *mas, unsigned long min,
+ void **entry)
+{
+ if (mas->index <= min)
+ goto none;
+
+ if (mas_is_none(mas) || mas_is_paused(mas))
+ mas->node = MAS_START;
+
+ if (mas_is_start(mas)) {
+ mas_walk(mas);
+ if (!mas->index)
+ goto none;
+ }
+
+ if (unlikely(mas_is_ptr(mas))) {
+ if (!mas->index)
+ goto none;
+ mas->index = mas->last = 0;
+ *entry = mas_root(mas);
+ return true;
+ }
+
+ if (mas_is_none(mas)) {
+ if (mas->index) {
+ /* Walked to out-of-range pointer? */
+ mas->index = mas->last = 0;
+ mas->node = MAS_ROOT;
+ *entry = mas_root(mas);
+ return true;
+ }
+ return true;
+ }
+
+ return false;
+
+none:
+ mas->node = MAS_NONE;
+ return true;
+}
+
/**
* mas_prev() - Get the previous entry
* @mas: The maple state
@@ -5968,37 +5974,37 @@ EXPORT_SYMBOL_GPL(mt_next);
*/
void *mas_prev(struct ma_state *mas, unsigned long min)
{
- if (!mas->index) {
- /* Nothing comes before 0 */
- mas->last = 0;
- mas->node = MAS_NONE;
- return NULL;
- }
+ void *entry = NULL;
- if (unlikely(mas_is_ptr(mas)))
- return NULL;
+ if (mas_prev_setup(mas, min, &entry))
+ return entry;
- if (mas_is_none(mas) || mas_is_paused(mas))
- mas->node = MAS_START;
+ return mas_prev_slot(mas, min, false);
+}
+EXPORT_SYMBOL_GPL(mas_prev);
- if (mas_is_start(mas)) {
- mas_walk(mas);
- if (!mas->index)
- return NULL;
- }
+/**
+ * mas_prev_range() - Advance to the previous range
+ * @mas: The maple state
+ * @min: The minimum value to check.
+ *
+ * Sets @mas->index and @mas->last to the range.
+ * Must hold rcu_read_lock or the write lock.
+ * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not
+ * searchable nodes.
+ *
+ * Return: the previous value or %NULL.
+ */
+void *mas_prev_range(struct ma_state *mas, unsigned long min)
+{
+ void *entry = NULL;
- if (mas_is_ptr(mas)) {
- if (!mas->index) {
- mas->last = 0;
- return NULL;
- }
+ if (mas_prev_setup(mas, min, &entry))
+ return entry;
- mas->index = mas->last = 0;
- return mas_root_locked(mas);
- }
- return mas_prev_entry(mas, min);
+ return mas_prev_slot(mas, min, true);
}
-EXPORT_SYMBOL_GPL(mas_prev);
+EXPORT_SYMBOL_GPL(mas_prev_range);
/**
* mt_prev() - get the previous value in the maple tree
@@ -6040,6 +6046,64 @@ void mas_pause(struct ma_state *mas)
EXPORT_SYMBOL_GPL(mas_pause);
/**
+ * mas_find_setup() - Internal function to set up mas_find*().
+ * @mas: The maple state
+ * @max: The maximum index
+ * @entry: Pointer to the entry
+ *
+ * Returns: True if entry is the answer, false otherwise.
+ */
+static inline bool mas_find_setup(struct ma_state *mas, unsigned long max,
+ void **entry)
+{
+ *entry = NULL;
+
+ if (unlikely(mas_is_none(mas))) {
+ if (unlikely(mas->last >= max))
+ return true;
+
+ mas->index = mas->last;
+ mas->node = MAS_START;
+ } else if (unlikely(mas_is_paused(mas))) {
+ if (unlikely(mas->last >= max))
+ return true;
+
+ mas->node = MAS_START;
+ mas->index = ++mas->last;
+ } else if (unlikely(mas_is_ptr(mas)))
+ goto ptr_out_of_range;
+
+ if (unlikely(mas_is_start(mas))) {
+ /* First run or continue */
+ if (mas->index > max)
+ return true;
+
+ *entry = mas_walk(mas);
+ if (*entry)
+ return true;
+
+ }
+
+ if (unlikely(!mas_searchable(mas))) {
+ if (unlikely(mas_is_ptr(mas)))
+ goto ptr_out_of_range;
+
+ return true;
+ }
+
+ if (mas->index == max)
+ return true;
+
+ return false;
+
+ptr_out_of_range:
+ mas->node = MAS_NONE;
+ mas->index = 1;
+ mas->last = ULONG_MAX;
+ return true;
+}
+
+/**
* mas_find() - On the first call, find the entry at or after mas->index up to
* %max. Otherwise, find the entry after mas->index.
* @mas: The maple state
@@ -6053,37 +6117,105 @@ EXPORT_SYMBOL_GPL(mas_pause);
*/
void *mas_find(struct ma_state *mas, unsigned long max)
{
+ void *entry = NULL;
+
+ if (mas_find_setup(mas, max, &entry))
+ return entry;
+
+ /* Retries on dead nodes handled by mas_next_slot */
+ return mas_next_slot(mas, max, false);
+}
+EXPORT_SYMBOL_GPL(mas_find);
+
+/**
+ * mas_find_range() - On the first call, find the entry at or after
+ * mas->index up to %max. Otherwise, advance to the next slot mas->index.
+ * @mas: The maple state
+ * @max: The maximum value to check.
+ *
+ * Must hold rcu_read_lock or the write lock.
+ * If an entry exists, last and index are updated accordingly.
+ * May set @mas->node to MAS_NONE.
+ *
+ * Return: The entry or %NULL.
+ */
+void *mas_find_range(struct ma_state *mas, unsigned long max)
+{
+ void *entry;
+
+ if (mas_find_setup(mas, max, &entry))
+ return entry;
+
+ /* Retries on dead nodes handled by mas_next_slot */
+ return mas_next_slot(mas, max, true);
+}
+EXPORT_SYMBOL_GPL(mas_find_range);
+
+/**
+ * mas_find_rev_setup() - Internal function to set up mas_find_*_rev()
+ * @mas: The maple state
+ * @min: The minimum index
+ * @entry: Pointer to the entry
+ *
+ * Returns: True if entry is the answer, false otherwise.
+ */
+static inline bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
+ void **entry)
+{
+ *entry = NULL;
+
+ if (unlikely(mas_is_none(mas))) {
+ if (mas->index <= min)
+ goto none;
+
+ mas->last = mas->index;
+ mas->node = MAS_START;
+ }
+
if (unlikely(mas_is_paused(mas))) {
- if (unlikely(mas->last == ULONG_MAX)) {
+ if (unlikely(mas->index <= min)) {
mas->node = MAS_NONE;
- return NULL;
+ return true;
}
mas->node = MAS_START;
- mas->index = ++mas->last;
+ mas->last = --mas->index;
}
- if (unlikely(mas_is_none(mas)))
- mas->node = MAS_START;
-
if (unlikely(mas_is_start(mas))) {
/* First run or continue */
- void *entry;
+ if (mas->index < min)
+ return true;
- if (mas->index > max)
- return NULL;
+ *entry = mas_walk(mas);
+ if (*entry)
+ return true;
+ }
- entry = mas_walk(mas);
- if (entry)
- return entry;
+ if (unlikely(!mas_searchable(mas))) {
+ if (mas_is_ptr(mas))
+ goto none;
+
+ if (mas_is_none(mas)) {
+ /*
+ * Walked to the location, and there was nothing so the
+ * previous location is 0.
+ */
+ mas->last = mas->index = 0;
+ mas->node = MAS_ROOT;
+ *entry = mas_root(mas);
+ return true;
+ }
}
- if (unlikely(!mas_searchable(mas)))
- return NULL;
+ if (mas->index < min)
+ return true;
- /* Retries on dead nodes handled by mas_next_entry */
- return mas_next_entry(mas, max);
+ return false;
+
+none:
+ mas->node = MAS_NONE;
+ return true;
}
-EXPORT_SYMBOL_GPL(mas_find);
/**
* mas_find_rev: On the first call, find the first non-null entry at or below
@@ -6100,37 +6232,41 @@ EXPORT_SYMBOL_GPL(mas_find);
*/
void *mas_find_rev(struct ma_state *mas, unsigned long min)
{
- if (unlikely(mas_is_paused(mas))) {
- if (unlikely(mas->last == ULONG_MAX)) {
- mas->node = MAS_NONE;
- return NULL;
- }
- mas->node = MAS_START;
- mas->last = --mas->index;
- }
+ void *entry;
- if (unlikely(mas_is_start(mas))) {
- /* First run or continue */
- void *entry;
+ if (mas_find_rev_setup(mas, min, &entry))
+ return entry;
- if (mas->index < min)
- return NULL;
+ /* Retries on dead nodes handled by mas_prev_slot */
+ return mas_prev_slot(mas, min, false);
- entry = mas_walk(mas);
- if (entry)
- return entry;
- }
+}
+EXPORT_SYMBOL_GPL(mas_find_rev);
- if (unlikely(!mas_searchable(mas)))
- return NULL;
+/**
+ * mas_find_range_rev: On the first call, find the first non-null entry at or
+ * below mas->index down to %min. Otherwise advance to the previous slot after
+ * mas->index down to %min.
+ * @mas: The maple state
+ * @min: The minimum value to check.
+ *
+ * Must hold rcu_read_lock or the write lock.
+ * If an entry exists, last and index are updated accordingly.
+ * May set @mas->node to MAS_NONE.
+ *
+ * Return: The entry or %NULL.
+ */
+void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
+{
+ void *entry;
- if (mas->index < min)
- return NULL;
+ if (mas_find_rev_setup(mas, min, &entry))
+ return entry;
- /* Retries on dead nodes handled by mas_prev_entry */
- return mas_prev_entry(mas, min);
+ /* Retries on dead nodes handled by mas_prev_slot */
+ return mas_prev_slot(mas, min, true);
}
-EXPORT_SYMBOL_GPL(mas_find_rev);
+EXPORT_SYMBOL_GPL(mas_find_range_rev);
/**
* mas_erase() - Find the range in which index resides and erase the entire
@@ -6357,7 +6493,7 @@ int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
{
int ret = 0;
- MA_STATE(mas, mt, min, max - size);
+ MA_STATE(mas, mt, min, min);
if (!mt_is_alloc(mt))
return -EINVAL;
@@ -6377,7 +6513,7 @@ int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
retry:
mas.offset = 0;
mas.index = min;
- mas.last = max - size;
+ mas.last = max - size + 1;
ret = mas_alloc(&mas, entry, size, startp);
if (mas_nomem(&mas, gfp))
goto retry;
@@ -6393,14 +6529,14 @@ int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
{
int ret = 0;
- MA_STATE(mas, mt, min, max - size);
+ MA_STATE(mas, mt, min, max - size + 1);
if (!mt_is_alloc(mt))
return -EINVAL;
if (WARN_ON_ONCE(mt_is_reserved(entry)))
return -EINVAL;
- if (min >= max)
+ if (min > max)
return -EINVAL;
if (max < size - 1)
@@ -6512,7 +6648,7 @@ retry:
if (entry)
goto unlock;
- while (mas_searchable(&mas) && (mas.index < max)) {
+ while (mas_searchable(&mas) && (mas.last < max)) {
entry = mas_next_entry(&mas, max);
if (likely(entry && !xa_is_zero(entry)))
break;
@@ -6525,10 +6661,9 @@ unlock:
if (likely(entry)) {
*index = mas.last + 1;
#ifdef CONFIG_DEBUG_MAPLE_TREE
- if ((*index) && (*index) <= copy)
+ if (MT_WARN_ON(mt, (*index) && ((*index) <= copy)))
pr_err("index not increased! %lx <= %lx\n",
*index, copy);
- MT_BUG_ON(mt, (*index) && ((*index) <= copy));
#endif
}
@@ -6674,7 +6809,7 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
max = mas->max;
mas->offset = 0;
while (likely(!ma_is_leaf(mt))) {
- MT_BUG_ON(mas->tree, mte_dead_node(mas->node));
+ MAS_WARN_ON(mas, mte_dead_node(mas->node));
slots = ma_slots(mn, mt);
entry = mas_slot(mas, slots, 0);
pivots = ma_pivots(mn, mt);
@@ -6685,7 +6820,7 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
mn = mas_mn(mas);
mt = mte_node_type(mas->node);
}
- MT_BUG_ON(mas->tree, mte_dead_node(mas->node));
+ MAS_WARN_ON(mas, mte_dead_node(mas->node));
mas->max = max;
slots = ma_slots(mn, mt);
@@ -6735,15 +6870,12 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
mas->node = mn;
mas_ascend(mas);
- while (mas->node != MAS_NONE) {
+ do {
p = mas->node;
p_min = mas->min;
p_max = mas->max;
mas_prev_node(mas, 0);
- }
-
- if (p == MAS_NONE)
- return;
+ } while (!mas_is_none(mas));
mas->node = p;
mas->max = p_max;
@@ -6752,22 +6884,33 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
/* Tree validations */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
- unsigned long min, unsigned long max, unsigned int depth);
+ unsigned long min, unsigned long max, unsigned int depth,
+ enum mt_dump_format format);
static void mt_dump_range(unsigned long min, unsigned long max,
- unsigned int depth)
+ unsigned int depth, enum mt_dump_format format)
{
static const char spaces[] = " ";
- if (min == max)
- pr_info("%.*s%lu: ", depth * 2, spaces, min);
- else
- pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max);
+ switch(format) {
+ case mt_dump_hex:
+ if (min == max)
+ pr_info("%.*s%lx: ", depth * 2, spaces, min);
+ else
+ pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max);
+ break;
+ default:
+ case mt_dump_dec:
+ if (min == max)
+ pr_info("%.*s%lu: ", depth * 2, spaces, min);
+ else
+ pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max);
+ }
}
static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
- unsigned int depth)
+ unsigned int depth, enum mt_dump_format format)
{
- mt_dump_range(min, max, depth);
+ mt_dump_range(min, max, depth, format);
if (xa_is_value(entry))
pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry),
@@ -6781,7 +6924,8 @@ static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
}
static void mt_dump_range64(const struct maple_tree *mt, void *entry,
- unsigned long min, unsigned long max, unsigned int depth)
+ unsigned long min, unsigned long max, unsigned int depth,
+ enum mt_dump_format format)
{
struct maple_range_64 *node = &mte_to_node(entry)->mr64;
bool leaf = mte_is_leaf(entry);
@@ -6789,8 +6933,16 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry,
int i;
pr_cont(" contents: ");
- for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++)
- pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
+ for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) {
+ switch(format) {
+ case mt_dump_hex:
+ pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
+ break;
+ default:
+ case mt_dump_dec:
+ pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
+ }
+ }
pr_cont("%p\n", node->slot[i]);
for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) {
unsigned long last = max;
@@ -6803,24 +6955,32 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry,
break;
if (leaf)
mt_dump_entry(mt_slot(mt, node->slot, i),
- first, last, depth + 1);
+ first, last, depth + 1, format);
else if (node->slot[i])
mt_dump_node(mt, mt_slot(mt, node->slot, i),
- first, last, depth + 1);
+ first, last, depth + 1, format);
if (last == max)
break;
if (last > max) {
- pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
+ switch(format) {
+ case mt_dump_hex:
+ pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
node, last, max, i);
- break;
+ break;
+ default:
+ case mt_dump_dec:
+ pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
+ node, last, max, i);
+ }
}
first = last + 1;
}
}
static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
- unsigned long min, unsigned long max, unsigned int depth)
+ unsigned long min, unsigned long max, unsigned int depth,
+ enum mt_dump_format format)
{
struct maple_arange_64 *node = &mte_to_node(entry)->ma64;
bool leaf = mte_is_leaf(entry);
@@ -6845,10 +7005,10 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
break;
if (leaf)
mt_dump_entry(mt_slot(mt, node->slot, i),
- first, last, depth + 1);
+ first, last, depth + 1, format);
else if (node->slot[i])
mt_dump_node(mt, mt_slot(mt, node->slot, i),
- first, last, depth + 1);
+ first, last, depth + 1, format);
if (last == max)
break;
@@ -6862,13 +7022,14 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
}
static void mt_dump_node(const struct maple_tree *mt, void *entry,
- unsigned long min, unsigned long max, unsigned int depth)
+ unsigned long min, unsigned long max, unsigned int depth,
+ enum mt_dump_format format)
{
struct maple_node *node = mte_to_node(entry);
unsigned int type = mte_node_type(entry);
unsigned int i;
- mt_dump_range(min, max, depth);
+ mt_dump_range(min, max, depth, format);
pr_cont("node %p depth %d type %d parent %p", node, depth, type,
node ? node->parent : NULL);
@@ -6879,15 +7040,15 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry,
if (min + i > max)
pr_cont("OUT OF RANGE: ");
mt_dump_entry(mt_slot(mt, node->slot, i),
- min + i, min + i, depth);
+ min + i, min + i, depth, format);
}
break;
case maple_leaf_64:
case maple_range_64:
- mt_dump_range64(mt, entry, min, max, depth);
+ mt_dump_range64(mt, entry, min, max, depth, format);
break;
case maple_arange_64:
- mt_dump_arange64(mt, entry, min, max, depth);
+ mt_dump_arange64(mt, entry, min, max, depth, format);
break;
default:
@@ -6895,16 +7056,16 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry,
}
}
-void mt_dump(const struct maple_tree *mt)
+void mt_dump(const struct maple_tree *mt, enum mt_dump_format format)
{
void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt));
pr_info("maple_tree(%p) flags %X, height %u root %p\n",
mt, mt->ma_flags, mt_height(mt), entry);
if (!xa_is_node(entry))
- mt_dump_entry(entry, 0, 0, 0);
+ mt_dump_entry(entry, 0, 0, 0, format);
else if (entry)
- mt_dump_node(mt, entry, 0, mt_node_max(entry), 0);
+ mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format);
}
EXPORT_SYMBOL_GPL(mt_dump);
@@ -6957,7 +7118,7 @@ static void mas_validate_gaps(struct ma_state *mas)
mas_mn(mas), i,
mas_get_slot(mas, i), gap,
p_end, p_start);
- mt_dump(mas->tree);
+ mt_dump(mas->tree, mt_dump_hex);
MT_BUG_ON(mas->tree,
gap != p_end - p_start + 1);
@@ -6988,27 +7149,29 @@ counted:
p_slot = mte_parent_slot(mas->node);
p_mn = mte_parent(mte);
MT_BUG_ON(mas->tree, max_gap > mas->max);
- if (ma_gaps(p_mn, mas_parent_enum(mas, mte))[p_slot] != max_gap) {
+ if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap);
- mt_dump(mas->tree);
+ mt_dump(mas->tree, mt_dump_hex);
}
MT_BUG_ON(mas->tree,
- ma_gaps(p_mn, mas_parent_enum(mas, mte))[p_slot] != max_gap);
+ ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap);
}
static void mas_validate_parent_slot(struct ma_state *mas)
{
struct maple_node *parent;
struct maple_enode *node;
- enum maple_type p_type = mas_parent_enum(mas, mas->node);
- unsigned char p_slot = mte_parent_slot(mas->node);
+ enum maple_type p_type;
+ unsigned char p_slot;
void __rcu **slots;
int i;
if (mte_is_root(mas->node))
return;
+ p_slot = mte_parent_slot(mas->node);
+ p_type = mas_parent_type(mas, mas->node);
parent = mte_parent(mas->node);
slots = ma_slots(parent, p_type);
MT_BUG_ON(mas->tree, mas_mn(mas) == parent);
@@ -7101,18 +7264,18 @@ static void mas_validate_limits(struct ma_state *mas)
if (prev_piv > piv) {
pr_err("%p[%u] piv %lu < prev_piv %lu\n",
mas_mn(mas), i, piv, prev_piv);
- MT_BUG_ON(mas->tree, piv < prev_piv);
+ MAS_WARN_ON(mas, piv < prev_piv);
}
if (piv < mas->min) {
pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i,
piv, mas->min);
- MT_BUG_ON(mas->tree, piv < mas->min);
+ MAS_WARN_ON(mas, piv < mas->min);
}
if (piv > mas->max) {
pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i,
piv, mas->max);
- MT_BUG_ON(mas->tree, piv > mas->max);
+ MAS_WARN_ON(mas, piv > mas->max);
}
prev_piv = piv;
if (piv == mas->max)
@@ -7135,7 +7298,7 @@ static void mas_validate_limits(struct ma_state *mas)
pr_err("%p[%u] should not have piv %lu\n",
mas_mn(mas), i, piv);
- MT_BUG_ON(mas->tree, i < mt_pivots[type] - 1);
+ MAS_WARN_ON(mas, i < mt_pivots[type] - 1);
}
}
}
@@ -7194,16 +7357,15 @@ void mt_validate(struct maple_tree *mt)
mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node));
while (!mas_is_none(&mas)) {
- MT_BUG_ON(mas.tree, mte_dead_node(mas.node));
+ MAS_WARN_ON(&mas, mte_dead_node(mas.node));
if (!mte_is_root(mas.node)) {
end = mas_data_end(&mas);
- if ((end < mt_min_slot_count(mas.node)) &&
- (mas.max != ULONG_MAX)) {
+ if (MAS_WARN_ON(&mas,
+ (end < mt_min_slot_count(mas.node)) &&
+ (mas.max != ULONG_MAX))) {
pr_err("Invalid size %u of %p\n", end,
- mas_mn(&mas));
- MT_BUG_ON(mas.tree, 1);
+ mas_mn(&mas));
}
-
}
mas_validate_parent_slot(&mas);
mas_validate_child_slot(&mas);
@@ -7219,4 +7381,34 @@ done:
}
EXPORT_SYMBOL_GPL(mt_validate);
+void mas_dump(const struct ma_state *mas)
+{
+ pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node);
+ if (mas_is_none(mas))
+ pr_err("(MAS_NONE) ");
+ else if (mas_is_ptr(mas))
+ pr_err("(MAS_ROOT) ");
+ else if (mas_is_start(mas))
+ pr_err("(MAS_START) ");
+ else if (mas_is_paused(mas))
+ pr_err("(MAS_PAUSED) ");
+
+ pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last);
+ pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n",
+ mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);
+ if (mas->index > mas->last)
+ pr_err("Check index & last\n");
+}
+EXPORT_SYMBOL_GPL(mas_dump);
+
+void mas_wr_dump(const struct ma_wr_state *wr_mas)
+{
+ pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n",
+ wr_mas->node, wr_mas->r_min, wr_mas->r_max);
+ pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
+ wr_mas->type, wr_mas->offset_end, wr_mas->node_end,
+ wr_mas->end_piv);
+}
+EXPORT_SYMBOL_GPL(mas_wr_dump);
+
#endif /* CONFIG_DEBUG_MAPLE_TREE */
diff --git a/lib/show_mem.c b/lib/show_mem.c
deleted file mode 100644
index 1485c87be935..000000000000
--- a/lib/show_mem.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Generic show_mem() implementation
- *
- * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
- */
-
-#include <linux/mm.h>
-#include <linux/cma.h>
-
-void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
-{
- unsigned long total = 0, reserved = 0, highmem = 0;
- struct zone *zone;
-
- printk("Mem-Info:\n");
- __show_free_areas(filter, nodemask, max_zone_idx);
-
- for_each_populated_zone(zone) {
-
- total += zone->present_pages;
- reserved += zone->present_pages - zone_managed_pages(zone);
-
- if (is_highmem(zone))
- highmem += zone->present_pages;
- }
-
- printk("%lu pages RAM\n", total);
- printk("%lu pages HighMem/MovableOnly\n", highmem);
- printk("%lu pages reserved\n", reserved);
-#ifdef CONFIG_CMA
- printk("%lu pages cma reserved\n", totalcma_pages);
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
-#endif
-}
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 2f5aa851834e..7053221ce1d8 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -29,6 +29,7 @@
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>
+#include <linux/seq_file.h>
#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)
@@ -60,6 +61,7 @@ struct stack_record {
u32 hash; /* Hash in the hash table */
u32 size; /* Number of stored frames */
union handle_parts handle;
+ refcount_t count; /* Number of the same repeated stacks */
unsigned long entries[]; /* Variable-sized array of frames */
};
@@ -305,6 +307,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
stack->handle.valid = 1;
stack->handle.extra = 0;
+ refcount_set(&stack->count, 1);
memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
pool_offset += required_size;
/*
@@ -457,8 +460,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
}
EXPORT_SYMBOL_GPL(stack_depot_save);
-unsigned int stack_depot_fetch(depot_stack_handle_t handle,
- unsigned long **entries)
+static struct stack_record *stack_depot_getstack(depot_stack_handle_t handle)
{
union handle_parts parts = { .handle = handle };
/*
@@ -470,6 +472,100 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
size_t offset = parts.offset << DEPOT_STACK_ALIGN;
struct stack_record *stack;
+ if (!handle)
+ return NULL;
+
+ if (parts.pool_index > pool_index_cached) {
+ WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
+ parts.pool_index, pool_index_cached, handle);
+ return NULL;
+ }
+ pool = stack_pools[parts.pool_index];
+ if (!pool)
+ return NULL;
+ stack = pool + offset;
+ return stack;
+}
+
+#ifdef CONFIG_PAGE_OWNER
+
+extern unsigned long page_owner_stack_threshold;
+
+void *stack_start(struct seq_file *m, loff_t *ppos)
+{
+ unsigned long *table = m->private;
+ struct stack_record **stacks, *stack;
+
+ /* First time */
+ if (*ppos == 0)
+ *table = 0;
+
+ if (*ppos == -1UL)
+ return NULL;
+
+ stacks = &stack_table[*table];
+ stack = (struct stack_record *)stacks;
+
+ return stack;
+}
+
+void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+ unsigned long *table = m->private;
+ unsigned long nr_table = *table;
+ struct stack_record *next = NULL, *stack = v, **stacks;
+ unsigned long stack_table_entries = stack_hash_mask + 1;
+
+ if (!stack) {
+new_table:
+ /* New table */
+ nr_table++;
+ if (nr_table >= stack_table_entries)
+ goto out;
+ stacks = &stack_table[nr_table];
+ stack = (struct stack_record *)stacks;
+ next = stack;
+ } else {
+ next = stack->next;
+ }
+
+ if (!next)
+ goto new_table;
+
+out:
+ *table = nr_table;
+ *ppos = (nr_table >= stack_table_entries) ? -1UL : *ppos + 1;
+ return next;
+}
+
+int stack_print(struct seq_file *m, void *v)
+{
+ char *buf;
+ int ret = 0;
+ struct stack_record *stack = v;
+
+ if (!stack->size || stack->size < 0 ||
+ stack->size > PAGE_SIZE || stack->handle.valid != 1 ||
+ refcount_read(&stack->count) < page_owner_stack_threshold)
+ return 0;
+
+ buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ ret += stack_trace_snprint(buf, PAGE_SIZE, stack->entries, stack->size, 0);
+ scnprintf(buf + ret, PAGE_SIZE - ret, "stack count: %d\n\n",
+ refcount_read(&stack->count));
+ seq_printf(m, buf);
+ seq_puts(m, "\n\n");
+ kfree(buf);
+
+ return 0;
+}
+#endif
+
+unsigned int stack_depot_fetch(depot_stack_handle_t handle,
+ unsigned long **entries)
+{
+ struct stack_record *stack;
+
*entries = NULL;
/*
* Let KMSAN know *entries is initialized. This shall prevent false
@@ -480,21 +576,33 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
if (!handle)
return 0;
- if (parts.pool_index > pool_index_cached) {
- WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
- parts.pool_index, pool_index_cached, handle);
- return 0;
- }
- pool = stack_pools[parts.pool_index];
- if (!pool)
+ stack = stack_depot_getstack(handle);
+ if (!stack)
return 0;
- stack = pool + offset;
*entries = stack->entries;
return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);
+void stack_depot_inc_count(depot_stack_handle_t handle)
+{
+ struct stack_record *stack = NULL;
+
+ stack = stack_depot_getstack(handle);
+ if (stack)
+ refcount_inc(&stack->count);
+}
+
+void stack_depot_dec_count(depot_stack_handle_t handle)
+{
+ struct stack_record *stack = NULL;
+
+ stack = stack_depot_getstack(handle);
+ if (stack)
+ refcount_dec(&stack->count);
+}
+
void stack_depot_print(depot_stack_handle_t stack)
{
unsigned long *entries;
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index f1db333270e9..9939be34e516 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -11,12 +11,33 @@
#include <linux/module.h>
#define MTREE_ALLOC_MAX 0x2000000000000Ul
-#ifndef CONFIG_DEBUG_MAPLE_TREE
-#define CONFIG_DEBUG_MAPLE_TREE
-#endif
#define CONFIG_MAPLE_SEARCH
#define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31)
+#ifndef CONFIG_DEBUG_MAPLE_TREE
+#define mt_dump(mt, fmt) do {} while (0)
+#define mt_validate(mt) do {} while (0)
+#define mt_cache_shrink() do {} while (0)
+#define mas_dump(mas) do {} while (0)
+#define mas_wr_dump(mas) do {} while (0)
+atomic_t maple_tree_tests_run;
+atomic_t maple_tree_tests_passed;
+#undef MT_BUG_ON
+
+#define MT_BUG_ON(__tree, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+#endif
+
/* #define BENCH_SLOT_STORE */
/* #define BENCH_NODE_STORE */
/* #define BENCH_AWALK */
@@ -30,54 +51,54 @@
#else
#define cond_resched() do {} while (0)
#endif
-static
-int mtree_insert_index(struct maple_tree *mt, unsigned long index, gfp_t gfp)
+static int __init mtree_insert_index(struct maple_tree *mt,
+ unsigned long index, gfp_t gfp)
{
return mtree_insert(mt, index, xa_mk_value(index & LONG_MAX), gfp);
}
-static void mtree_erase_index(struct maple_tree *mt, unsigned long index)
+static void __init mtree_erase_index(struct maple_tree *mt, unsigned long index)
{
MT_BUG_ON(mt, mtree_erase(mt, index) != xa_mk_value(index & LONG_MAX));
MT_BUG_ON(mt, mtree_load(mt, index) != NULL);
}
-static int mtree_test_insert(struct maple_tree *mt, unsigned long index,
+static int __init mtree_test_insert(struct maple_tree *mt, unsigned long index,
void *ptr)
{
return mtree_insert(mt, index, ptr, GFP_KERNEL);
}
-static int mtree_test_store_range(struct maple_tree *mt, unsigned long start,
- unsigned long end, void *ptr)
+static int __init mtree_test_store_range(struct maple_tree *mt,
+ unsigned long start, unsigned long end, void *ptr)
{
return mtree_store_range(mt, start, end, ptr, GFP_KERNEL);
}
-static int mtree_test_store(struct maple_tree *mt, unsigned long start,
+static int __init mtree_test_store(struct maple_tree *mt, unsigned long start,
void *ptr)
{
return mtree_test_store_range(mt, start, start, ptr);
}
-static int mtree_test_insert_range(struct maple_tree *mt, unsigned long start,
- unsigned long end, void *ptr)
+static int __init mtree_test_insert_range(struct maple_tree *mt,
+ unsigned long start, unsigned long end, void *ptr)
{
return mtree_insert_range(mt, start, end, ptr, GFP_KERNEL);
}
-static void *mtree_test_load(struct maple_tree *mt, unsigned long index)
+static void __init *mtree_test_load(struct maple_tree *mt, unsigned long index)
{
return mtree_load(mt, index);
}
-static void *mtree_test_erase(struct maple_tree *mt, unsigned long index)
+static void __init *mtree_test_erase(struct maple_tree *mt, unsigned long index)
{
return mtree_erase(mt, index);
}
#if defined(CONFIG_64BIT)
-static noinline void check_mtree_alloc_range(struct maple_tree *mt,
+static noinline void __init check_mtree_alloc_range(struct maple_tree *mt,
unsigned long start, unsigned long end, unsigned long size,
unsigned long expected, int eret, void *ptr)
{
@@ -94,7 +115,7 @@ static noinline void check_mtree_alloc_range(struct maple_tree *mt,
MT_BUG_ON(mt, result != expected);
}
-static noinline void check_mtree_alloc_rrange(struct maple_tree *mt,
+static noinline void __init check_mtree_alloc_rrange(struct maple_tree *mt,
unsigned long start, unsigned long end, unsigned long size,
unsigned long expected, int eret, void *ptr)
{
@@ -102,7 +123,7 @@ static noinline void check_mtree_alloc_rrange(struct maple_tree *mt,
unsigned long result = expected + 1;
int ret;
- ret = mtree_alloc_rrange(mt, &result, ptr, size, start, end - 1,
+ ret = mtree_alloc_rrange(mt, &result, ptr, size, start, end,
GFP_KERNEL);
MT_BUG_ON(mt, ret != eret);
if (ret)
@@ -112,8 +133,8 @@ static noinline void check_mtree_alloc_rrange(struct maple_tree *mt,
}
#endif
-static noinline void check_load(struct maple_tree *mt, unsigned long index,
- void *ptr)
+static noinline void __init check_load(struct maple_tree *mt,
+ unsigned long index, void *ptr)
{
void *ret = mtree_test_load(mt, index);
@@ -122,7 +143,7 @@ static noinline void check_load(struct maple_tree *mt, unsigned long index,
MT_BUG_ON(mt, ret != ptr);
}
-static noinline void check_store_range(struct maple_tree *mt,
+static noinline void __init check_store_range(struct maple_tree *mt,
unsigned long start, unsigned long end, void *ptr, int expected)
{
int ret = -EINVAL;
@@ -138,7 +159,7 @@ static noinline void check_store_range(struct maple_tree *mt,
check_load(mt, i, ptr);
}
-static noinline void check_insert_range(struct maple_tree *mt,
+static noinline void __init check_insert_range(struct maple_tree *mt,
unsigned long start, unsigned long end, void *ptr, int expected)
{
int ret = -EINVAL;
@@ -154,8 +175,8 @@ static noinline void check_insert_range(struct maple_tree *mt,
check_load(mt, i, ptr);
}
-static noinline void check_insert(struct maple_tree *mt, unsigned long index,
- void *ptr)
+static noinline void __init check_insert(struct maple_tree *mt,
+ unsigned long index, void *ptr)
{
int ret = -EINVAL;
@@ -163,7 +184,7 @@ static noinline void check_insert(struct maple_tree *mt, unsigned long index,
MT_BUG_ON(mt, ret != 0);
}
-static noinline void check_dup_insert(struct maple_tree *mt,
+static noinline void __init check_dup_insert(struct maple_tree *mt,
unsigned long index, void *ptr)
{
int ret = -EINVAL;
@@ -173,13 +194,13 @@ static noinline void check_dup_insert(struct maple_tree *mt,
}
-static noinline
-void check_index_load(struct maple_tree *mt, unsigned long index)
+static noinline void __init check_index_load(struct maple_tree *mt,
+ unsigned long index)
{
return check_load(mt, index, xa_mk_value(index & LONG_MAX));
}
-static inline int not_empty(struct maple_node *node)
+static inline __init int not_empty(struct maple_node *node)
{
int i;
@@ -194,8 +215,8 @@ static inline int not_empty(struct maple_node *node)
}
-static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max,
- bool verbose)
+static noinline void __init check_rev_seq(struct maple_tree *mt,
+ unsigned long max, bool verbose)
{
unsigned long i = max, j;
@@ -219,7 +240,7 @@ static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max,
#ifndef __KERNEL__
if (verbose) {
rcu_barrier();
- mt_dump(mt);
+ mt_dump(mt, mt_dump_dec);
pr_info(" %s test of 0-%lu %luK in %d active (%d total)\n",
__func__, max, mt_get_alloc_size()/1024, mt_nr_allocated(),
mt_nr_tallocated());
@@ -227,7 +248,7 @@ static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max,
#endif
}
-static noinline void check_seq(struct maple_tree *mt, unsigned long max,
+static noinline void __init check_seq(struct maple_tree *mt, unsigned long max,
bool verbose)
{
unsigned long i, j;
@@ -248,7 +269,7 @@ static noinline void check_seq(struct maple_tree *mt, unsigned long max,
#ifndef __KERNEL__
if (verbose) {
rcu_barrier();
- mt_dump(mt);
+ mt_dump(mt, mt_dump_dec);
pr_info(" seq test of 0-%lu %luK in %d active (%d total)\n",
max, mt_get_alloc_size()/1024, mt_nr_allocated(),
mt_nr_tallocated());
@@ -256,7 +277,7 @@ static noinline void check_seq(struct maple_tree *mt, unsigned long max,
#endif
}
-static noinline void check_lb_not_empty(struct maple_tree *mt)
+static noinline void __init check_lb_not_empty(struct maple_tree *mt)
{
unsigned long i, j;
unsigned long huge = 4000UL * 1000 * 1000;
@@ -275,13 +296,13 @@ static noinline void check_lb_not_empty(struct maple_tree *mt)
mtree_destroy(mt);
}
-static noinline void check_lower_bound_split(struct maple_tree *mt)
+static noinline void __init check_lower_bound_split(struct maple_tree *mt)
{
MT_BUG_ON(mt, !mtree_empty(mt));
check_lb_not_empty(mt);
}
-static noinline void check_upper_bound_split(struct maple_tree *mt)
+static noinline void __init check_upper_bound_split(struct maple_tree *mt)
{
unsigned long i, j;
unsigned long huge;
@@ -306,7 +327,7 @@ static noinline void check_upper_bound_split(struct maple_tree *mt)
mtree_destroy(mt);
}
-static noinline void check_mid_split(struct maple_tree *mt)
+static noinline void __init check_mid_split(struct maple_tree *mt)
{
unsigned long huge = 8000UL * 1000 * 1000;
@@ -315,7 +336,7 @@ static noinline void check_mid_split(struct maple_tree *mt)
check_lb_not_empty(mt);
}
-static noinline void check_rev_find(struct maple_tree *mt)
+static noinline void __init check_rev_find(struct maple_tree *mt)
{
int i, nr_entries = 200;
void *val;
@@ -354,7 +375,7 @@ static noinline void check_rev_find(struct maple_tree *mt)
rcu_read_unlock();
}
-static noinline void check_find(struct maple_tree *mt)
+static noinline void __init check_find(struct maple_tree *mt)
{
unsigned long val = 0;
unsigned long count;
@@ -571,7 +592,7 @@ static noinline void check_find(struct maple_tree *mt)
mtree_destroy(mt);
}
-static noinline void check_find_2(struct maple_tree *mt)
+static noinline void __init check_find_2(struct maple_tree *mt)
{
unsigned long i, j;
void *entry;
@@ -616,7 +637,7 @@ static noinline void check_find_2(struct maple_tree *mt)
#if defined(CONFIG_64BIT)
-static noinline void check_alloc_rev_range(struct maple_tree *mt)
+static noinline void __init check_alloc_rev_range(struct maple_tree *mt)
{
/*
* Generated by:
@@ -624,7 +645,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
* awk -F "-" '{printf "0x%s, 0x%s, ", $1, $2}'
*/
- unsigned long range[] = {
+ static const unsigned long range[] = {
/* Inclusive , Exclusive. */
0x565234af2000, 0x565234af4000,
0x565234af4000, 0x565234af9000,
@@ -652,7 +673,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
0x7fff58791000, 0x7fff58793000,
};
- unsigned long holes[] = {
+ static const unsigned long holes[] = {
/*
* Note: start of hole is INCLUSIVE
* end of hole is EXCLUSIVE
@@ -672,7 +693,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
* 4. number that should be returned.
* 5. return value
*/
- unsigned long req_range[] = {
+ static const unsigned long req_range[] = {
0x565234af9000, /* Min */
0x7fff58791000, /* Max */
0x1000, /* Size */
@@ -680,7 +701,7 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
0, /* Return value success. */
0x0, /* Min */
- 0x565234AF1 << 12, /* Max */
+ 0x565234AF0 << 12, /* Max */
0x3000, /* Size */
0x565234AEE << 12, /* max - 3. */
0, /* Return value success. */
@@ -692,14 +713,14 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
0, /* Return value success. */
0x0, /* Min */
- 0x7F36D510A << 12, /* Max */
+ 0x7F36D5109 << 12, /* Max */
0x4000, /* Size */
0x7F36D5106 << 12, /* First rev hole of size 0x4000 */
0, /* Return value success. */
/* Ascend test. */
0x0,
- 34148798629 << 12,
+ 34148798628 << 12,
19 << 12,
34148797418 << 12,
0x0,
@@ -711,6 +732,12 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
0x0,
-EBUSY,
+ /* Single space test. */
+ 34148798725 << 12,
+ 34148798725 << 12,
+ 1 << 12,
+ 34148798725 << 12,
+ 0,
};
int i, range_count = ARRAY_SIZE(range);
@@ -759,9 +786,9 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
mas_unlock(&mas);
for (i = 0; i < req_range_count; i += 5) {
#if DEBUG_REV_RANGE
- pr_debug("\tReverse request between %lu-%lu size %lu, should get %lu\n",
- req_range[i] >> 12,
- (req_range[i + 1] >> 12) - 1,
+ pr_debug("\tReverse request %d between %lu-%lu size %lu, should get %lu\n",
+ i, req_range[i] >> 12,
+ (req_range[i + 1] >> 12),
req_range[i+2] >> 12,
req_range[i+3] >> 12);
#endif
@@ -777,13 +804,14 @@ static noinline void check_alloc_rev_range(struct maple_tree *mt)
mt_set_non_kernel(1);
mtree_erase(mt, 34148798727); /* create a deleted range. */
+ mtree_erase(mt, 34148798725);
check_mtree_alloc_rrange(mt, 0, 34359052173, 210253414,
34148798725, 0, mt);
mtree_destroy(mt);
}
-static noinline void check_alloc_range(struct maple_tree *mt)
+static noinline void __init check_alloc_range(struct maple_tree *mt)
{
/*
* Generated by:
@@ -791,7 +819,7 @@ static noinline void check_alloc_range(struct maple_tree *mt)
* awk -F "-" '{printf "0x%s, 0x%s, ", $1, $2}'
*/
- unsigned long range[] = {
+ static const unsigned long range[] = {
/* Inclusive , Exclusive. */
0x565234af2000, 0x565234af4000,
0x565234af4000, 0x565234af9000,
@@ -818,7 +846,7 @@ static noinline void check_alloc_range(struct maple_tree *mt)
0x7fff5878e000, 0x7fff58791000,
0x7fff58791000, 0x7fff58793000,
};
- unsigned long holes[] = {
+ static const unsigned long holes[] = {
/* Start of hole, end of hole, size of hole (+1) */
0x565234afb000, 0x565234afc000, 0x1000,
0x565234afe000, 0x565235def000, 0x12F1000,
@@ -833,7 +861,7 @@ static noinline void check_alloc_range(struct maple_tree *mt)
* 4. number that should be returned.
* 5. return value
*/
- unsigned long req_range[] = {
+ static const unsigned long req_range[] = {
0x565234af9000, /* Min */
0x7fff58791000, /* Max */
0x1000, /* Size */
@@ -880,6 +908,13 @@ static noinline void check_alloc_range(struct maple_tree *mt)
4503599618982063UL << 12, /* Size */
34359052178 << 12, /* Expected location */
-EBUSY, /* Return failure. */
+
+ /* Test a single entry */
+ 34148798648 << 12, /* Min */
+ 34148798648 << 12, /* Max */
+ 4096, /* Size of 1 */
+ 34148798648 << 12, /* Location is the same as min/max */
+ 0, /* Success */
};
int i, range_count = ARRAY_SIZE(range);
int req_range_count = ARRAY_SIZE(req_range);
@@ -893,7 +928,7 @@ static noinline void check_alloc_range(struct maple_tree *mt)
#if DEBUG_ALLOC_RANGE
pr_debug("\tInsert %lu-%lu\n", range[i] >> 12,
(range[i + 1] >> 12) - 1);
- mt_dump(mt);
+ mt_dump(mt, mt_dump_hex);
#endif
check_insert_range(mt, range[i] >> 12, (range[i + 1] >> 12) - 1,
xa_mk_value(range[i] >> 12), 0);
@@ -934,7 +969,7 @@ static noinline void check_alloc_range(struct maple_tree *mt)
xa_mk_value(req_range[i] >> 12)); /* pointer */
mt_validate(mt);
#if DEBUG_ALLOC_RANGE
- mt_dump(mt);
+ mt_dump(mt, mt_dump_hex);
#endif
}
@@ -942,10 +977,10 @@ static noinline void check_alloc_range(struct maple_tree *mt)
}
#endif
-static noinline void check_ranges(struct maple_tree *mt)
+static noinline void __init check_ranges(struct maple_tree *mt)
{
int i, val, val2;
- unsigned long r[] = {
+ static const unsigned long r[] = {
10, 15,
20, 25,
17, 22, /* Overlaps previous range. */
@@ -1210,7 +1245,7 @@ static noinline void check_ranges(struct maple_tree *mt)
MT_BUG_ON(mt, mt_height(mt) != 4);
}
-static noinline void check_next_entry(struct maple_tree *mt)
+static noinline void __init check_next_entry(struct maple_tree *mt)
{
void *entry = NULL;
unsigned long limit = 30, i = 0;
@@ -1234,7 +1269,7 @@ static noinline void check_next_entry(struct maple_tree *mt)
mtree_destroy(mt);
}
-static noinline void check_prev_entry(struct maple_tree *mt)
+static noinline void __init check_prev_entry(struct maple_tree *mt)
{
unsigned long index = 16;
void *value;
@@ -1278,7 +1313,7 @@ static noinline void check_prev_entry(struct maple_tree *mt)
mas_unlock(&mas);
}
-static noinline void check_root_expand(struct maple_tree *mt)
+static noinline void __init check_root_expand(struct maple_tree *mt)
{
MA_STATE(mas, mt, 0, 0);
void *ptr;
@@ -1287,6 +1322,7 @@ static noinline void check_root_expand(struct maple_tree *mt)
mas_lock(&mas);
mas_set(&mas, 3);
ptr = mas_walk(&mas);
+ MT_BUG_ON(mt, mas.index != 0);
MT_BUG_ON(mt, ptr != NULL);
MT_BUG_ON(mt, mas.index != 0);
MT_BUG_ON(mt, mas.last != ULONG_MAX);
@@ -1356,7 +1392,7 @@ static noinline void check_root_expand(struct maple_tree *mt)
mas_store_gfp(&mas, ptr, GFP_KERNEL);
ptr = mas_next(&mas, ULONG_MAX);
MT_BUG_ON(mt, ptr != NULL);
- MT_BUG_ON(mt, (mas.index != 1) && (mas.last != ULONG_MAX));
+ MT_BUG_ON(mt, (mas.index != ULONG_MAX) && (mas.last != ULONG_MAX));
mas_set(&mas, 1);
ptr = mas_prev(&mas, 0);
@@ -1367,13 +1403,13 @@ static noinline void check_root_expand(struct maple_tree *mt)
mas_unlock(&mas);
}
-static noinline void check_gap_combining(struct maple_tree *mt)
+static noinline void __init check_gap_combining(struct maple_tree *mt)
{
struct maple_enode *mn1, *mn2;
void *entry;
unsigned long singletons = 100;
- unsigned long *seq100;
- unsigned long seq100_64[] = {
+ static const unsigned long *seq100;
+ static const unsigned long seq100_64[] = {
/* 0-5 */
74, 75, 76,
50, 100, 2,
@@ -1387,7 +1423,7 @@ static noinline void check_gap_combining(struct maple_tree *mt)
76, 2, 79, 85, 4,
};
- unsigned long seq100_32[] = {
+ static const unsigned long seq100_32[] = {
/* 0-5 */
61, 62, 63,
50, 100, 2,
@@ -1401,11 +1437,11 @@ static noinline void check_gap_combining(struct maple_tree *mt)
76, 2, 79, 85, 4,
};
- unsigned long seq2000[] = {
+ static const unsigned long seq2000[] = {
1152, 1151,
1100, 1200, 2,
};
- unsigned long seq400[] = {
+ static const unsigned long seq400[] = {
286, 318,
256, 260, 266, 270, 275, 280, 290, 398,
286, 310,
@@ -1564,7 +1600,7 @@ static noinline void check_gap_combining(struct maple_tree *mt)
mt_set_non_kernel(0);
mtree_destroy(mt);
}
-static noinline void check_node_overwrite(struct maple_tree *mt)
+static noinline void __init check_node_overwrite(struct maple_tree *mt)
{
int i, max = 4000;
@@ -1572,12 +1608,12 @@ static noinline void check_node_overwrite(struct maple_tree *mt)
mtree_test_store_range(mt, i*100, i*100 + 50, xa_mk_value(i*100));
mtree_test_store_range(mt, 319951, 367950, NULL);
- /*mt_dump(mt); */
+ /*mt_dump(mt, mt_dump_dec); */
mt_validate(mt);
}
#if defined(BENCH_SLOT_STORE)
-static noinline void bench_slot_store(struct maple_tree *mt)
+static noinline void __init bench_slot_store(struct maple_tree *mt)
{
int i, brk = 105, max = 1040, brk_start = 100, count = 20000000;
@@ -1593,7 +1629,7 @@ static noinline void bench_slot_store(struct maple_tree *mt)
#endif
#if defined(BENCH_NODE_STORE)
-static noinline void bench_node_store(struct maple_tree *mt)
+static noinline void __init bench_node_store(struct maple_tree *mt)
{
int i, overwrite = 76, max = 240, count = 20000000;
@@ -1612,7 +1648,7 @@ static noinline void bench_node_store(struct maple_tree *mt)
#endif
#if defined(BENCH_AWALK)
-static noinline void bench_awalk(struct maple_tree *mt)
+static noinline void __init bench_awalk(struct maple_tree *mt)
{
int i, max = 2500, count = 50000000;
MA_STATE(mas, mt, 1470, 1470);
@@ -1629,7 +1665,7 @@ static noinline void bench_awalk(struct maple_tree *mt)
}
#endif
#if defined(BENCH_WALK)
-static noinline void bench_walk(struct maple_tree *mt)
+static noinline void __init bench_walk(struct maple_tree *mt)
{
int i, max = 2500, count = 550000000;
MA_STATE(mas, mt, 1470, 1470);
@@ -1646,7 +1682,7 @@ static noinline void bench_walk(struct maple_tree *mt)
#endif
#if defined(BENCH_MT_FOR_EACH)
-static noinline void bench_mt_for_each(struct maple_tree *mt)
+static noinline void __init bench_mt_for_each(struct maple_tree *mt)
{
int i, count = 1000000;
unsigned long max = 2500, index = 0;
@@ -1670,7 +1706,7 @@ static noinline void bench_mt_for_each(struct maple_tree *mt)
#endif
/* check_forking - simulate the kernel forking sequence with the tree. */
-static noinline void check_forking(struct maple_tree *mt)
+static noinline void __init check_forking(struct maple_tree *mt)
{
struct maple_tree newmt;
@@ -1709,7 +1745,7 @@ static noinline void check_forking(struct maple_tree *mt)
mtree_destroy(&newmt);
}
-static noinline void check_iteration(struct maple_tree *mt)
+static noinline void __init check_iteration(struct maple_tree *mt)
{
int i, nr_entries = 125;
void *val;
@@ -1765,7 +1801,6 @@ static noinline void check_iteration(struct maple_tree *mt)
mas.index = 760;
mas.last = 765;
mas_store(&mas, val);
- mas_next(&mas, ULONG_MAX);
}
i++;
}
@@ -1777,7 +1812,7 @@ static noinline void check_iteration(struct maple_tree *mt)
mt_set_non_kernel(0);
}
-static noinline void check_mas_store_gfp(struct maple_tree *mt)
+static noinline void __init check_mas_store_gfp(struct maple_tree *mt)
{
struct maple_tree newmt;
@@ -1810,7 +1845,7 @@ static noinline void check_mas_store_gfp(struct maple_tree *mt)
}
#if defined(BENCH_FORK)
-static noinline void bench_forking(struct maple_tree *mt)
+static noinline void __init bench_forking(struct maple_tree *mt)
{
struct maple_tree newmt;
@@ -1852,15 +1887,17 @@ static noinline void bench_forking(struct maple_tree *mt)
}
#endif
-static noinline void next_prev_test(struct maple_tree *mt)
+static noinline void __init next_prev_test(struct maple_tree *mt)
{
int i, nr_entries;
void *val;
MA_STATE(mas, mt, 0, 0);
struct maple_enode *mn;
- unsigned long *level2;
- unsigned long level2_64[] = {707, 1000, 710, 715, 720, 725};
- unsigned long level2_32[] = {1747, 2000, 1750, 1755, 1760, 1765};
+ static const unsigned long *level2;
+ static const unsigned long level2_64[] = { 707, 1000, 710, 715, 720,
+ 725};
+ static const unsigned long level2_32[] = { 1747, 2000, 1750, 1755,
+ 1760, 1765};
if (MAPLE_32BIT) {
nr_entries = 500;
@@ -1974,7 +2011,7 @@ static noinline void next_prev_test(struct maple_tree *mt)
val = mas_next(&mas, ULONG_MAX);
MT_BUG_ON(mt, val != NULL);
- MT_BUG_ON(mt, mas.index != ULONG_MAX);
+ MT_BUG_ON(mt, mas.index != 0x7d6);
MT_BUG_ON(mt, mas.last != ULONG_MAX);
val = mas_prev(&mas, 0);
@@ -1998,7 +2035,8 @@ static noinline void next_prev_test(struct maple_tree *mt)
val = mas_prev(&mas, 0);
MT_BUG_ON(mt, val != NULL);
MT_BUG_ON(mt, mas.index != 0);
- MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.last != 5);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
mas.index = 0;
mas.last = 5;
@@ -2010,7 +2048,7 @@ static noinline void next_prev_test(struct maple_tree *mt)
val = mas_prev(&mas, 0);
MT_BUG_ON(mt, val != NULL);
MT_BUG_ON(mt, mas.index != 0);
- MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.last != 9);
mas_unlock(&mas);
mtree_destroy(mt);
@@ -2028,7 +2066,7 @@ static noinline void next_prev_test(struct maple_tree *mt)
/* Test spanning writes that require balancing right sibling or right cousin */
-static noinline void check_spanning_relatives(struct maple_tree *mt)
+static noinline void __init check_spanning_relatives(struct maple_tree *mt)
{
unsigned long i, nr_entries = 1000;
@@ -2041,7 +2079,7 @@ static noinline void check_spanning_relatives(struct maple_tree *mt)
mtree_store_range(mt, 9365, 9955, NULL, GFP_KERNEL);
}
-static noinline void check_fuzzer(struct maple_tree *mt)
+static noinline void __init check_fuzzer(struct maple_tree *mt)
{
/*
* 1. Causes a spanning rebalance of a single root node.
@@ -2438,7 +2476,7 @@ static noinline void check_fuzzer(struct maple_tree *mt)
}
/* duplicate the tree with a specific gap */
-static noinline void check_dup_gaps(struct maple_tree *mt,
+static noinline void __init check_dup_gaps(struct maple_tree *mt,
unsigned long nr_entries, bool zero_start,
unsigned long gap)
{
@@ -2478,7 +2516,7 @@ static noinline void check_dup_gaps(struct maple_tree *mt,
}
/* Duplicate many sizes of trees. Mainly to test expected entry values */
-static noinline void check_dup(struct maple_tree *mt)
+static noinline void __init check_dup(struct maple_tree *mt)
{
int i;
int big_start = 100010;
@@ -2566,7 +2604,7 @@ static noinline void check_dup(struct maple_tree *mt)
}
}
-static noinline void check_bnode_min_spanning(struct maple_tree *mt)
+static noinline void __init check_bnode_min_spanning(struct maple_tree *mt)
{
int i = 50;
MA_STATE(mas, mt, 0, 0);
@@ -2585,7 +2623,7 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt)
mt_set_non_kernel(0);
}
-static noinline void check_empty_area_window(struct maple_tree *mt)
+static noinline void __init check_empty_area_window(struct maple_tree *mt)
{
unsigned long i, nr_entries = 20;
MA_STATE(mas, mt, 0, 0);
@@ -2660,7 +2698,7 @@ static noinline void check_empty_area_window(struct maple_tree *mt)
MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY);
mas_reset(&mas);
- MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY);
+ MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EINVAL);
mas_reset(&mas);
mas_empty_area(&mas, 100, 165, 3);
@@ -2670,7 +2708,7 @@ static noinline void check_empty_area_window(struct maple_tree *mt)
rcu_read_unlock();
}
-static noinline void check_empty_area_fill(struct maple_tree *mt)
+static noinline void __init check_empty_area_fill(struct maple_tree *mt)
{
const unsigned long max = 0x25D78000;
unsigned long size;
@@ -2713,12 +2751,635 @@ static noinline void check_empty_area_fill(struct maple_tree *mt)
mt_set_non_kernel(0);
}
+/*
+ * Check MAS_START, MAS_PAUSE, active (implied), and MAS_NONE transitions.
+ *
+ * The table below shows the single entry tree (0-0 pointer) and normal tree
+ * with nodes.
+ *
+ * Function ENTRY Start Result index & last
+ * ┬ ┬ ┬ ┬ ┬
+ * │ │ │ │ └─ the final range
+ * │ │ │ └─ The node value after execution
+ * │ │ └─ The node value before execution
+ * │ └─ If the entry exists or does not exists (DNE)
+ * └─ The function name
+ *
+ * Function ENTRY Start Result index & last
+ * mas_next()
+ * - after last
+ * Single entry tree at 0-0
+ * ------------------------
+ * DNE MAS_START MAS_NONE 1 - oo
+ * DNE MAS_PAUSE MAS_NONE 1 - oo
+ * DNE MAS_ROOT MAS_NONE 1 - oo
+ * when index = 0
+ * DNE MAS_NONE MAS_ROOT 0
+ * when index > 0
+ * DNE MAS_NONE MAS_NONE 1 - oo
+ *
+ * Normal tree
+ * -----------
+ * exists MAS_START active range
+ * DNE MAS_START active set to last range
+ * exists MAS_PAUSE active range
+ * DNE MAS_PAUSE active set to last range
+ * exists MAS_NONE active range
+ * exists active active range
+ * DNE active active set to last range
+ *
+ * Function ENTRY Start Result index & last
+ * mas_prev()
+ * - before index
+ * Single entry tree at 0-0
+ * ------------------------
+ * if index > 0
+ * exists MAS_START MAS_ROOT 0
+ * exists MAS_PAUSE MAS_ROOT 0
+ * exists MAS_NONE MAS_ROOT 0
+ *
+ * if index == 0
+ * DNE MAS_START MAS_NONE 0
+ * DNE MAS_PAUSE MAS_NONE 0
+ * DNE MAS_NONE MAS_NONE 0
+ * DNE MAS_ROOT MAS_NONE 0
+ *
+ * Normal tree
+ * -----------
+ * exists MAS_START active range
+ * DNE MAS_START active set to min
+ * exists MAS_PAUSE active range
+ * DNE MAS_PAUSE active set to min
+ * exists MAS_NONE active range
+ * DNE MAS_NONE MAS_NONE set to min
+ * any MAS_ROOT MAS_NONE 0
+ * exists active active range
+ * DNE active active last range
+ *
+ * Function ENTRY Start Result index & last
+ * mas_find()
+ * - at index or next
+ * Single entry tree at 0-0
+ * ------------------------
+ * if index > 0
+ * DNE MAS_START MAS_NONE 0
+ * DNE MAS_PAUSE MAS_NONE 0
+ * DNE MAS_ROOT MAS_NONE 0
+ * DNE MAS_NONE MAS_NONE 0
+ * if index == 0
+ * exists MAS_START MAS_ROOT 0
+ * exists MAS_PAUSE MAS_ROOT 0
+ * exists MAS_NONE MAS_ROOT 0
+ *
+ * Normal tree
+ * -----------
+ * exists MAS_START active range
+ * DNE MAS_START active set to max
+ * exists MAS_PAUSE active range
+ * DNE MAS_PAUSE active set to max
+ * exists MAS_NONE active range
+ * exists active active range
+ * DNE active active last range (max < last)
+ *
+ * Function ENTRY Start Result index & last
+ * mas_find_rev()
+ * - at index or before
+ * Single entry tree at 0-0
+ * ------------------------
+ * if index > 0
+ * exists MAS_START MAS_ROOT 0
+ * exists MAS_PAUSE MAS_ROOT 0
+ * exists MAS_NONE MAS_ROOT 0
+ * if index == 0
+ * DNE MAS_START MAS_NONE 0
+ * DNE MAS_PAUSE MAS_NONE 0
+ * DNE MAS_NONE MAS_NONE 0
+ * DNE MAS_ROOT MAS_NONE 0
+ *
+ * Normal tree
+ * -----------
+ * exists MAS_START active range
+ * DNE MAS_START active set to min
+ * exists MAS_PAUSE active range
+ * DNE MAS_PAUSE active set to min
+ * exists MAS_NONE active range
+ * exists active active range
+ * DNE active active last range (min > index)
+ *
+ * Function ENTRY Start Result index & last
+ * mas_walk()
+ * - Look up index
+ * Single entry tree at 0-0
+ * ------------------------
+ * if index > 0
+ * DNE MAS_START MAS_ROOT 1 - oo
+ * DNE MAS_PAUSE MAS_ROOT 1 - oo
+ * DNE MAS_NONE MAS_ROOT 1 - oo
+ * DNE MAS_ROOT MAS_ROOT 1 - oo
+ * if index == 0
+ * exists MAS_START MAS_ROOT 0
+ * exists MAS_PAUSE MAS_ROOT 0
+ * exists MAS_NONE MAS_ROOT 0
+ * exists MAS_ROOT MAS_ROOT 0
+ *
+ * Normal tree
+ * -----------
+ * exists MAS_START active range
+ * DNE MAS_START active range of NULL
+ * exists MAS_PAUSE active range
+ * DNE MAS_PAUSE active range of NULL
+ * exists MAS_NONE active range
+ * DNE MAS_NONE active range of NULL
+ * exists active active range
+ * DNE active active range of NULL
+ */
+
+#define mas_active(x) (((x).node != MAS_ROOT) && \
+ ((x).node != MAS_START) && \
+ ((x).node != MAS_PAUSE) && \
+ ((x).node != MAS_NONE))
+static noinline void __init check_state_handling(struct maple_tree *mt)
+{
+ MA_STATE(mas, mt, 0, 0);
+ void *entry, *ptr = (void *) 0x1234500;
+ void *ptr2 = &ptr;
+ void *ptr3 = &ptr2;
+
+ /* Check MAS_ROOT First */
+ mtree_store_range(mt, 0, 0, ptr, GFP_KERNEL);
+
+ mas_lock(&mas);
+ /* prev: Start -> none */
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* prev: Start -> root */
+ mas_set(&mas, 10);
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* prev: pause -> root */
+ mas_set(&mas, 10);
+ mas_pause(&mas);
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* next: start -> none */
+ mas_set(&mas, 0);
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* next: start -> none */
+ mas_set(&mas, 10);
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* find: start -> root */
+ mas_set(&mas, 0);
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* find: root -> none */
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* find: none -> none */
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* find: start -> none */
+ mas_set(&mas, 10);
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* find_rev: none -> root */
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* find_rev: start -> root */
+ mas_set(&mas, 0);
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* find_rev: root -> none */
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* find_rev: none -> none */
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* find_rev: start -> root */
+ mas_set(&mas, 10);
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* walk: start -> none */
+ mas_set(&mas, 10);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* walk: pause -> none*/
+ mas_set(&mas, 10);
+ mas_pause(&mas);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* walk: none -> none */
+ mas.index = mas.last = 10;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* walk: none -> none */
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* walk: start -> root */
+ mas_set(&mas, 0);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* walk: pause -> root */
+ mas_set(&mas, 0);
+ mas_pause(&mas);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* walk: none -> root */
+ mas.node = MAS_NONE;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* walk: root -> root */
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ /* walk: root -> none */
+ mas_set(&mas, 10);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 1);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, mas.node != MAS_NONE);
+
+ /* walk: none -> root */
+ mas.index = mas.last = 0;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0);
+ MT_BUG_ON(mt, mas.node != MAS_ROOT);
+
+ mas_unlock(&mas);
+
+ /* Check when there is an actual node */
+ mtree_store_range(mt, 0, 0, NULL, GFP_KERNEL);
+ mtree_store_range(mt, 0x1000, 0x1500, ptr, GFP_KERNEL);
+ mtree_store_range(mt, 0x2000, 0x2500, ptr2, GFP_KERNEL);
+ mtree_store_range(mt, 0x3000, 0x3500, ptr3, GFP_KERNEL);
+
+ mas_lock(&mas);
+
+ /* next: start ->active */
+ mas_set(&mas, 0);
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* next: pause ->active */
+ mas_set(&mas, 0);
+ mas_pause(&mas);
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* next: none ->active */
+ mas.index = mas.last = 0;
+ mas.offset = 0;
+ mas.node = MAS_NONE;
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* next:active ->active */
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr2);
+ MT_BUG_ON(mt, mas.index != 0x2000);
+ MT_BUG_ON(mt, mas.last != 0x2500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* next:active -> active out of range*/
+ entry = mas_next(&mas, 0x2999);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x2501);
+ MT_BUG_ON(mt, mas.last != 0x2fff);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* Continue after out of range*/
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr3);
+ MT_BUG_ON(mt, mas.index != 0x3000);
+ MT_BUG_ON(mt, mas.last != 0x3500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* next:active -> active out of range*/
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x3501);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* next: none -> active, skip value at location */
+ mas_set(&mas, 0);
+ entry = mas_next(&mas, ULONG_MAX);
+ mas.node = MAS_NONE;
+ mas.offset = 0;
+ entry = mas_next(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr2);
+ MT_BUG_ON(mt, mas.index != 0x2000);
+ MT_BUG_ON(mt, mas.last != 0x2500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* prev:active ->active */
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* prev:active -> active out of range*/
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0x0FFF);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* prev: pause ->active */
+ mas_set(&mas, 0x3600);
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr3);
+ mas_pause(&mas);
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr2);
+ MT_BUG_ON(mt, mas.index != 0x2000);
+ MT_BUG_ON(mt, mas.last != 0x2500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* prev:active -> active out of range*/
+ entry = mas_prev(&mas, 0x1600);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x1501);
+ MT_BUG_ON(mt, mas.last != 0x1FFF);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* prev: active ->active, continue*/
+ entry = mas_prev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find: start ->active */
+ mas_set(&mas, 0);
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find: pause ->active */
+ mas_set(&mas, 0);
+ mas_pause(&mas);
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find: start ->active on value */;
+ mas_set(&mas, 1200);
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find:active ->active */
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != ptr2);
+ MT_BUG_ON(mt, mas.index != 0x2000);
+ MT_BUG_ON(mt, mas.last != 0x2500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+
+ /* find:active -> active (NULL)*/
+ entry = mas_find(&mas, 0x2700);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x2501);
+ MT_BUG_ON(mt, mas.last != 0x2FFF);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find: none ->active */
+ entry = mas_find(&mas, 0x5000);
+ MT_BUG_ON(mt, entry != ptr3);
+ MT_BUG_ON(mt, mas.index != 0x3000);
+ MT_BUG_ON(mt, mas.last != 0x3500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find:active -> active (NULL) end*/
+ entry = mas_find(&mas, ULONG_MAX);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x3501);
+ MT_BUG_ON(mt, mas.last != ULONG_MAX);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find_rev: active (END) ->active */
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr3);
+ MT_BUG_ON(mt, mas.index != 0x3000);
+ MT_BUG_ON(mt, mas.last != 0x3500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find_rev:active ->active */
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr2);
+ MT_BUG_ON(mt, mas.index != 0x2000);
+ MT_BUG_ON(mt, mas.last != 0x2500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find_rev: pause ->active */
+ mas_pause(&mas);
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find_rev:active -> active */
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0);
+ MT_BUG_ON(mt, mas.last != 0x0FFF);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* find_rev: start ->active */
+ mas_set(&mas, 0x1200);
+ entry = mas_find_rev(&mas, 0);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk start ->active */
+ mas_set(&mas, 0x1200);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk start ->active */
+ mas_set(&mas, 0x1600);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x1501);
+ MT_BUG_ON(mt, mas.last != 0x1fff);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk pause ->active */
+ mas_set(&mas, 0x1200);
+ mas_pause(&mas);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk pause -> active */
+ mas_set(&mas, 0x1600);
+ mas_pause(&mas);
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x1501);
+ MT_BUG_ON(mt, mas.last != 0x1fff);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk none -> active */
+ mas_set(&mas, 0x1200);
+ mas.node = MAS_NONE;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk none -> active */
+ mas_set(&mas, 0x1600);
+ mas.node = MAS_NONE;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x1501);
+ MT_BUG_ON(mt, mas.last != 0x1fff);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk active -> active */
+ mas.index = 0x1200;
+ mas.last = 0x1200;
+ mas.offset = 0;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != ptr);
+ MT_BUG_ON(mt, mas.index != 0x1000);
+ MT_BUG_ON(mt, mas.last != 0x1500);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ /* mas_walk active -> active */
+ mas.index = 0x1600;
+ mas.last = 0x1600;
+ entry = mas_walk(&mas);
+ MT_BUG_ON(mt, entry != NULL);
+ MT_BUG_ON(mt, mas.index != 0x1501);
+ MT_BUG_ON(mt, mas.last != 0x1fff);
+ MT_BUG_ON(mt, !mas_active(mas));
+
+ mas_unlock(&mas);
+}
+
static DEFINE_MTREE(tree);
-static int maple_tree_seed(void)
+static int __init maple_tree_seed(void)
{
- unsigned long set[] = {5015, 5014, 5017, 25, 1000,
- 1001, 1002, 1003, 1005, 0,
- 5003, 5002};
+ unsigned long set[] = { 5015, 5014, 5017, 25, 1000,
+ 1001, 1002, 1003, 1005, 0,
+ 5003, 5002};
void *ptr = &set;
pr_info("\nTEST STARTING\n\n");
@@ -2974,6 +3635,10 @@ static int maple_tree_seed(void)
mtree_destroy(&tree);
+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+ check_state_handling(&tree);
+ mtree_destroy(&tree);
+
#if defined(BENCH)
skip:
#endif
@@ -2988,7 +3653,7 @@ skip:
return -EINVAL;
}
-static void maple_tree_harvest(void)
+static void __exit maple_tree_harvest(void)
{
}
diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
index f06df065dec0..2c34e8a33a1c 100644
--- a/lib/zstd/common/zstd_deps.h
+++ b/lib/zstd/common/zstd_deps.h
@@ -105,21 +105,3 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
#endif /* ZSTD_DEPS_IO */
#endif /* ZSTD_DEPS_NEED_IO */
-
-/*
- * Only requested when MSAN is enabled.
- * Need:
- * intptr_t
- */
-#ifdef ZSTD_DEPS_NEED_STDINT
-#ifndef ZSTD_DEPS_STDINT
-#define ZSTD_DEPS_STDINT
-
-/*
- * The Linux Kernel doesn't provide intptr_t, only uintptr_t, which
- * is an unsigned long.
- */
-typedef long intptr_t;
-
-#endif /* ZSTD_DEPS_STDINT */
-#endif /* ZSTD_DEPS_NEED_STDINT */
diff --git a/mm/Makefile b/mm/Makefile
index e29afc890cde..678530a07326 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o \
+ compaction.o show_mem.o\
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -89,6 +89,7 @@ obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_KMSAN) += kmsan/
obj-$(CONFIG_FAILSLAB) += failslab.o
+obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
@@ -123,6 +124,7 @@ obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
+obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_DAMON) += damon/
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/compaction.c b/mm/compaction.c
index c8bcdea15f5f..2c71fe27bc7b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -392,18 +392,14 @@ void reset_isolation_suitable(pg_data_t *pgdat)
* Sets the pageblock skip bit if it was clear. Note that this is a hint as
* locks are not required for read/writers. Returns true if it was already set.
*/
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
bool skip;
- /* Do no update if skip hint is being ignored */
+ /* Do not update if skip hint is being ignored */
if (cc->ignore_skip_hint)
return false;
- if (!pageblock_aligned(pfn))
- return false;
-
skip = get_pageblock_skip(page);
if (!skip && !cc->no_set_skip_hint)
set_pageblock_skip(page);
@@ -470,8 +466,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
return false;
}
@@ -765,6 +760,34 @@ static bool too_many_isolated(pg_data_t *pgdat)
return too_many;
}
+/*
+ * Check if this base page should be skipped from isolation because
+ * it has extra refcounts that will prevent it from being migrated.
+ * This function is called for regular pages only, and not
+ * for THP or hugetlbfs pages. This code is inspired by similar code
+ * in migrate_vma_check_page(), can_split_folio() and
+ * folio_migrate_mapping()
+ */
+static inline bool page_has_extrarefs(struct page *page,
+ struct address_space *mapping)
+{
+ unsigned long extra_refs;
+
+ /* anonymous page can have extra ref from swap cache */
+ if (mapping)
+ extra_refs = 1 + PagePrivate(page);
+ else
+ extra_refs = PageSwapCache(page) ? 1 : 0;
+
+ /*
+ * This is an admittedly racy check but good enough to determine
+ * if a page is pinned and can not be migrated
+ */
+ if ((page_ref_count(page) - extra_refs) > page_mapcount(page))
+ return true;
+ return false;
+}
+
/**
* isolate_migratepages_block() - isolate all migrate-able pages within
* a single pageblock
@@ -1003,12 +1026,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail;
/*
- * Migration will fail if an anonymous page is pinned in memory,
- * so avoid taking lru_lock and isolating it unnecessarily in an
- * admittedly racy check.
+ * Migration will fail if a page has extra refcounts
+ * preventing it from migrating, so avoid taking
+ * lru_lock and isolating it unnecessarily
*/
mapping = page_mapping(page);
- if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+ if (page_has_extrarefs(page, mapping))
goto isolate_fail_put;
/*
@@ -1075,9 +1098,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
lruvec_memcg_debug(lruvec, page_folio(page));
/* Try get exclusive access under lock */
- if (!skip_updated) {
+ if (!skip_updated && valid_page) {
skip_updated = true;
- if (test_and_set_skip(cc, page, low_pfn))
+ if (test_and_set_skip(cc, valid_page))
goto isolate_abort;
}
@@ -1736,6 +1759,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE
*/
static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
static int sysctl_extfrag_threshold = 500;
+static int __read_mostly sysctl_compact_memory;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1864,7 +1888,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
@@ -2456,7 +2479,8 @@ rescan:
}
/*
* If an ASYNC or SYNC_LIGHT fails to migrate a page
- * within the current order-aligned block, scan the
+ * within the current order-aligned block and
+ * fast_find_migrateblock may be used then scan the
* remainder of the pageblock. This will mark the
* pageblock "skip" to avoid rescanning in the near
* future. This will isolate more pages than necessary
@@ -2464,8 +2488,9 @@ rescan:
* fast_find_migrateblock revisiting blocks that were
* recently partially scanned.
*/
- if (cc->direct_compaction && !cc->finish_pageblock &&
- (cc->mode < MIGRATE_SYNC)) {
+ if (!pageblock_aligned(cc->migrate_pfn) &&
+ !cc->ignore_skip_hint && !cc->finish_pageblock &&
+ (cc->mode < MIGRATE_SYNC)) {
cc->finish_pageblock = true;
/*
@@ -2780,6 +2805,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int
static int sysctl_compaction_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+ if (sysctl_compact_memory != 1)
+ return -EINVAL;
+
if (write)
compact_nodes();
@@ -3095,7 +3129,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
static struct ctl_table vm_compaction[] = {
{
.procname = "compact_memory",
- .data = NULL,
+ .data = &sysctl_compact_memory,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = sysctl_compaction_handler,
diff --git a/mm/debug.c b/mm/debug.c
index c7b228097bd9..ee533a5ceb79 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
+
+void vma_iter_dump_tree(const struct vma_iterator *vmi)
+{
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ mas_dump(&vmi->mas);
+ mt_dump(vmi->mas.tree, mt_dump_hex);
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+}
+
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
new file mode 100644
index 000000000000..f9d145730fd1
--- /dev/null
+++ b/mm/debug_page_alloc.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+
+unsigned int _debug_guardpage_minorder;
+
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ pr_err("Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
+
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ if (order >= debug_guardpage_minorder())
+ return false;
+
+ __SetPageGuard(page);
+ INIT_LIST_HEAD(&page->buddy_list);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
+}
+
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ __ClearPageGuard(page);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
+}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index d2b0f8fc9649..a151a21e571b 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -226,7 +226,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
- bool empty = false;
+ bool empty;
if (!dev)
return NULL;
@@ -276,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
*/
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
- if (list_empty(&dev->dma_pools))
- empty = true;
+ empty = list_empty(&dev->dma_pools);
list_add(&retval->pools, &dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty) {
@@ -361,7 +360,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
void dma_pool_destroy(struct dma_pool *pool)
{
struct dma_page *page, *tmp;
- bool empty = false, busy = false;
+ bool empty, busy = false;
if (unlikely(!pool))
return;
@@ -369,8 +368,7 @@ void dma_pool_destroy(struct dma_pool *pool)
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
- if (list_empty(&pool->dev->dma_pools))
- empty = true;
+ empty = list_empty(&pool->dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty)
device_remove_file(pool->dev, &dev_attr_pools);
diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c
new file mode 100644
index 000000000000..b1b09cce9394
--- /dev/null
+++ b/mm/fail_page_alloc.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fault-inject.h>
+#include <linux/mm.h>
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_gfp_highmem;
+ bool ignore_gfp_reclaim;
+ u32 min_order;
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_reclaim = true,
+ .ignore_gfp_highmem = true,
+ .min_order = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ int flags = 0;
+
+ if (order < fail_page_alloc.min_order)
+ return false;
+ if (gfp_mask & __GFP_NOFAIL)
+ return false;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return false;
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* See comment in __should_failslab() */
+ if (gfp_mask & __GFP_NOWARN)
+ flags |= FAULT_NOWARN;
+
+ return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
+
+ return 0;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..570bc8c3db87 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -58,6 +59,8 @@
#include <asm/mman.h>
+#include "swap.h"
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -1625,36 +1628,6 @@ void folio_end_writeback(struct folio *folio)
}
EXPORT_SYMBOL(folio_end_writeback);
-/*
- * After completing I/O on a page, call this routine to update the page
- * flags appropriately
- */
-void page_endio(struct page *page, bool is_write, int err)
-{
- struct folio *folio = page_folio(page);
-
- if (!is_write) {
- if (!err) {
- folio_mark_uptodate(folio);
- } else {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- }
- folio_unlock(folio);
- } else {
- if (err) {
- struct address_space *mapping;
-
- folio_set_error(folio);
- mapping = folio_mapping(folio);
- if (mapping)
- mapping_set_error(mapping, err);
- }
- folio_end_writeback(folio);
- }
-}
-EXPORT_SYMBOL_GPL(page_endio);
-
/**
* __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
* @folio: The folio to lock
@@ -4119,3 +4092,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);
+
+#ifdef CONFIG_CACHESTAT_SYSCALL
+/**
+ * filemap_cachestat() - compute the page cache statistics of a mapping
+ * @mapping: The mapping to compute the statistics for.
+ * @first_index: The starting page cache index.
+ * @last_index: The final page index (inclusive).
+ * @cs: the cachestat struct to write the result to.
+ *
+ * This will query the page cache statistics of a mapping in the
+ * page range of [first_index, last_index] (inclusive). The statistics
+ * queried include: number of dirty pages, number of pages marked for
+ * writeback, and the number of (recently) evicted pages.
+ */
+static void filemap_cachestat(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
+{
+ XA_STATE(xas, &mapping->i_pages, first_index);
+ struct folio *folio;
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_index) {
+ unsigned long nr_pages;
+ pgoff_t folio_first_index, folio_last_index;
+
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (xa_is_value(folio)) {
+ /* page is evicted */
+ void *shadow = (void *)folio;
+ bool workingset; /* not used */
+ int order = xa_get_order(xas.xa, xas.xa_index);
+
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ cs->nr_evicted += nr_pages;
+
+#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
+ if (shmem_mapping(mapping)) {
+ /* shmem file - in swap cache */
+ swp_entry_t swp = radix_to_swp_entry(folio);
+
+ shadow = get_shadow_from_swap_cache(swp);
+ }
+#endif
+ if (workingset_test_recent(shadow, true, &workingset))
+ cs->nr_recently_evicted += nr_pages;
+
+ goto resched;
+ }
+
+ nr_pages = folio_nr_pages(folio);
+ folio_first_index = folio_pgoff(folio);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ /* page is in cache */
+ cs->nr_cache += nr_pages;
+
+ if (folio_test_dirty(folio))
+ cs->nr_dirty += nr_pages;
+
+ if (folio_test_writeback(folio))
+ cs->nr_writeback += nr_pages;
+
+resched:
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * The cachestat(2) system call.
+ *
+ * cachestat() returns the page cache statistics of a file in the
+ * bytes range specified by `off` and `len`: number of cached pages,
+ * number of dirty pages, number of pages marked for writeback,
+ * number of evicted pages, and number of recently evicted pages.
+ *
+ * An evicted page is a page that is previously in the page cache
+ * but has been evicted since. A page is recently evicted if its last
+ * eviction was recent enough that its reentry to the cache would
+ * indicate that it is actively being used by the system, and that
+ * there is memory pressure on the system.
+ *
+ * `off` and `len` must be non-negative integers. If `len` > 0,
+ * the queried range is [`off`, `off` + `len`]. If `len` == 0,
+ * we will query in the range from `off` to the end of the file.
+ *
+ * The `flags` argument is unused for now, but is included for future
+ * extensibility. User should pass 0 (i.e no flag specified).
+ *
+ * Currently, hugetlbfs is not supported.
+ *
+ * Because the status of a page can change after cachestat() checks it
+ * but before it returns to the application, the returned values may
+ * contain stale information.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - cstat or cstat_range points to an illegal address
+ * -EINVAL - invalid flags
+ * -EBADF - invalid file descriptor
+ * -EOPNOTSUPP - file descriptor is of a hugetlbfs file
+ */
+SYSCALL_DEFINE4(cachestat, unsigned int, fd,
+ struct cachestat_range __user *, cstat_range,
+ struct cachestat __user *, cstat, unsigned int, flags)
+{
+ struct fd f = fdget(fd);
+ struct address_space *mapping;
+ struct cachestat_range csr;
+ struct cachestat cs;
+ pgoff_t first_index, last_index;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (copy_from_user(&csr, cstat_range,
+ sizeof(struct cachestat_range))) {
+ fdput(f);
+ return -EFAULT;
+ }
+
+ /* hugetlbfs is not supported */
+ if (is_file_hugepages(f.file)) {
+ fdput(f);
+ return -EOPNOTSUPP;
+ }
+
+ if (flags != 0) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ first_index = csr.off >> PAGE_SHIFT;
+ last_index =
+ csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
+ memset(&cs, 0, sizeof(struct cachestat));
+ mapping = f.file->f_mapping;
+ filemap_cachestat(mapping, first_index, last_index, &cs);
+ fdput(f);
+
+ if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_CACHESTAT_SYSCALL */
diff --git a/mm/gup.c b/mm/gup.c
index bbe416236593..dbe96d266670 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -18,6 +18,7 @@
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
+#include <linux/shmem_fs.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -959,16 +960,54 @@ static int faultin_page(struct vm_area_struct *vma,
return 0;
}
+/*
+ * Writing to file-backed mappings which require folio dirty tracking using GUP
+ * is a fundamentally broken operation, as kernel write access to GUP mappings
+ * do not adhere to the semantics expected by a file system.
+ *
+ * Consider the following scenario:-
+ *
+ * 1. A folio is written to via GUP which write-faults the memory, notifying
+ * the file system and dirtying the folio.
+ * 2. Later, writeback is triggered, resulting in the folio being cleaned and
+ * the PTE being marked read-only.
+ * 3. The GUP caller writes to the folio, as it is mapped read/write via the
+ * direct mapping.
+ * 4. The GUP caller, now done with the page, unpins it and sets it dirty
+ * (though it does not have to).
+ *
+ * This results in both data being written to a folio without writenotify, and
+ * the folio being dirtied unexpectedly (if the caller decides to do so).
+ */
+static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
+ unsigned long gup_flags)
+{
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the case we disallow.
+ */
+ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
+ (FOLL_PIN | FOLL_LONGTERM))
+ return true;
+
+ /*
+ * If the VMA does not require dirty tracking then no problematic write
+ * can occur either.
+ */
+ return !vma_needs_dirty_tracking(vma);
+}
+
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
int write = (gup_flags & FOLL_WRITE);
int foreign = (gup_flags & FOLL_REMOTE);
+ bool vma_anon = vma_is_anonymous(vma);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ if ((gup_flags & FOLL_ANON) && !vma_anon)
return -EFAULT;
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
@@ -978,6 +1017,10 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
return -EFAULT;
if (write) {
+ if (!vma_anon &&
+ !writable_file_mapping_allowed(vma, gup_flags))
+ return -EFAULT;
+
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@@ -1024,8 +1067,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
@@ -1039,8 +1080,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
@@ -1076,7 +1115,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
@@ -1116,9 +1155,9 @@ static long __get_user_pages(struct mm_struct *mm,
goto out;
if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i,
- gup_flags, locked);
+ i = follow_hugetlb_page(mm, vma, pages,
+ &start, &nr_pages, i,
+ gup_flags, locked);
if (!*locked) {
/*
* We've got a VM_FAULT_RETRY
@@ -1183,10 +1222,6 @@ retry:
ctx.page_mask = 0;
}
next_page:
- if (vmas) {
- vmas[i] = vma;
- ctx.page_mask = 0;
- }
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
@@ -1341,7 +1376,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int flags)
{
@@ -1379,7 +1413,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
pages_done = 0;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
- vmas, locked);
+ locked);
if (!(flags & FOLL_UNLOCKABLE)) {
/* VM_FAULT_RETRY couldn't trigger, bypass */
pages_done = ret;
@@ -1443,7 +1477,7 @@ retry:
*locked = 1;
ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, locked);
+ pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
BUG_ON(ret != 0);
@@ -1541,7 +1575,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* not result in a stack expansion that recurses back here.
*/
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked ? locked : &local_locked);
+ NULL, locked ? locked : &local_locked);
lru_add_drain();
return ret;
}
@@ -1599,7 +1633,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked);
+ NULL, locked);
lru_add_drain();
return ret;
}
@@ -1667,8 +1701,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
- struct vm_area_struct **vmas, int *locked,
- unsigned int foll_flags)
+ int *locked, unsigned int foll_flags)
{
struct vm_area_struct *vma;
bool must_unlock = false;
@@ -1712,8 +1745,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
if (pages[i])
get_page(pages[i]);
}
- if (vmas)
- vmas[i] = vma;
+
start = (start + PAGE_SIZE) & PAGE_MASK;
}
@@ -1894,8 +1926,7 @@ struct page *get_dump_page(unsigned long addr)
int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL,
- &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2068,7 +2099,6 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int gup_flags)
{
@@ -2076,13 +2106,13 @@ static long __gup_longterm_locked(struct mm_struct *mm,
long rc, nr_pinned_pages;
if (!(gup_flags & FOLL_LONGTERM))
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
locked, gup_flags);
flags = memalloc_pin_save();
do {
nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, locked,
+ pages, locked,
gup_flags);
if (nr_pinned_pages <= 0) {
rc = nr_pinned_pages;
@@ -2100,9 +2130,8 @@ static long __gup_longterm_locked(struct mm_struct *mm,
* Check that the given flags are valid for the exported gup/pup interface, and
* update them with the required flags that the caller must have set.
*/
-static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
- int *locked, unsigned int *gup_flags_p,
- unsigned int to_set)
+static bool is_valid_gup_args(struct page **pages, int *locked,
+ unsigned int *gup_flags_p, unsigned int to_set)
{
unsigned int gup_flags = *gup_flags_p;
@@ -2144,13 +2173,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
(gup_flags & FOLL_PCI_P2PDMA)))
return false;
- /*
- * Can't use VMAs with locked, as locked allows GUP to unlock
- * which invalidates the vmas array
- */
- if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE)))
- return false;
-
*gup_flags_p = gup_flags;
return true;
}
@@ -2165,8 +2187,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -2181,8 +2201,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held for read or write.
*
* get_user_pages_remote walks a process's page tables and takes a reference
@@ -2219,15 +2237,15 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
int local_locked = 1;
- if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
FOLL_TOUCH | FOLL_REMOTE))
return -EINVAL;
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
locked ? locked : &local_locked,
gup_flags);
}
@@ -2237,7 +2255,7 @@ EXPORT_SYMBOL(get_user_pages_remote);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
return 0;
}
@@ -2251,8 +2269,6 @@ long get_user_pages_remote(struct mm_struct *mm,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* This is the same as get_user_pages_remote(), just with a less-flexible
* calling convention where we assume that the mm being operated on belongs to
@@ -2260,16 +2276,15 @@ long get_user_pages_remote(struct mm_struct *mm,
* obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
int locked = 1;
- if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages, pages,
- vmas, &locked, gup_flags);
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages);
@@ -2293,12 +2308,12 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
{
int locked = 0;
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_TOUCH | FOLL_UNLOCKABLE))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages, pages,
- NULL, &locked, gup_flags);
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -2337,6 +2352,82 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*/
#ifdef CONFIG_HAVE_FAST_GUP
+/*
+ * Used in the GUP-fast path to determine whether a pin is permitted for a
+ * specific folio.
+ *
+ * This call assumes the caller has pinned the folio, that the lowest page table
+ * level still points to this folio, and that interrupts have been disabled.
+ *
+ * Writing to pinned file-backed dirty tracked folios is inherently problematic
+ * (see comment describing the writable_file_mapping_allowed() function). We
+ * therefore try to avoid the most egregious case of a long-term mapping doing
+ * so.
+ *
+ * This function cannot be as thorough as that one as the VMA is not available
+ * in the fast path, so instead we whitelist known good cases and if in doubt,
+ * fall back to the slow path.
+ */
+static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
+{
+ struct address_space *mapping;
+ unsigned long mapping_flags;
+
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the one we disallow.
+ */
+ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
+ (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
+ return true;
+
+ /* The folio is pinned, so we can safely access folio fields. */
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return false;
+
+ /* hugetlb mappings do not require dirty-tracking. */
+ if (folio_test_hugetlb(folio))
+ return true;
+
+ /*
+ * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
+ * cannot proceed, which means no actions performed under RCU can
+ * proceed either.
+ *
+ * inodes and thus their mappings are freed under RCU, which means the
+ * mapping cannot be freed beneath us and thus we can safely dereference
+ * it.
+ */
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * However, there may be operations which _alter_ the mapping, so ensure
+ * we read it once and only once.
+ */
+ mapping = READ_ONCE(folio->mapping);
+
+ /*
+ * The mapping may have been truncated, in any case we cannot determine
+ * if this mapping is safe - fall back to slow path to determine how to
+ * proceed.
+ */
+ if (!mapping)
+ return false;
+
+ /* Anonymous folios pose no problem. */
+ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ if (mapping_flags)
+ return mapping_flags & PAGE_MAPPING_ANON;
+
+ /*
+ * At this point, we know the mapping is non-null and points to an
+ * address_space object. The only remaining whitelisted file system is
+ * shmem.
+ */
+ return shmem_mapping(mapping);
+}
+
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
struct page **pages)
@@ -2422,6 +2513,11 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
goto pte_unmap;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
@@ -2614,6 +2710,11 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2680,6 +2781,10 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2720,6 +2825,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2755,6 +2865,16 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2983,7 +3103,7 @@ static int internal_get_user_pages_fast(unsigned long start,
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
- pages, NULL, &locked,
+ pages, &locked,
gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
if (ret < 0) {
/*
@@ -3025,7 +3145,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages,
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_GET | FOLL_FAST_ONLY))
return -EINVAL;
@@ -3058,7 +3178,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
* request.
*/
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -3083,7 +3203,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -3098,8 +3218,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -3114,14 +3232,14 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
int local_locked = 1;
- if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
return 0;
- return __gup_longterm_locked(mm, start, nr_pages, pages, vmas,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
locked ? locked : &local_locked,
gup_flags);
}
@@ -3135,8 +3253,6 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
* FOLL_PIN is set.
@@ -3145,15 +3261,14 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* see Documentation/core-api/pin_user_pages.rst for details.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
int locked = 1;
- if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return 0;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, &locked, gup_flags);
+ pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3167,11 +3282,11 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
{
int locked = 0;
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
return 0;
- return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL,
+ return __gup_longterm_locked(current->mm, start, nr_pages, pages,
&locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
diff --git a/mm/gup_test.c b/mm/gup_test.c
index 8ae7307a1bb6..1668ce0e0783 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -139,29 +139,27 @@ static int __gup_test_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_BASIC_TEST:
- nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
- NULL);
+ nr = get_user_pages(addr, nr, gup->gup_flags, pages + i);
break;
case PIN_FAST_BENCHMARK:
nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case PIN_BASIC_TEST:
- nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
- NULL);
+ nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i);
break;
case PIN_LONGTERM_BENCHMARK:
nr = pin_user_pages(addr, nr,
gup->gup_flags | FOLL_LONGTERM,
- pages + i, NULL);
+ pages + i);
break;
case DUMP_USER_PAGES_TEST:
if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
nr = pin_user_pages(addr, nr, gup->gup_flags,
- pages + i, NULL);
+ pages + i);
else
nr = get_user_pages(addr, nr, gup->gup_flags,
- pages + i, NULL);
+ pages + i);
break;
default:
ret = -EINVAL;
@@ -271,7 +269,7 @@ static inline int pin_longterm_test_start(unsigned long arg)
gup_flags, pages);
else
cur_pages = pin_user_pages(addr, remaining_pages,
- gup_flags, pages, NULL);
+ gup_flags, pages);
if (cur_pages < 0) {
pin_longterm_test_stop();
ret = cur_pages;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f154019e6b84..ea24718db4af 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6425,17 +6425,14 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages,
- struct vm_area_struct **vmas)
+static void record_subpages(struct page *page, struct vm_area_struct *vma,
+ int refs, struct page **pages)
{
int nr;
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
pages[nr] = nth_page(page, nr);
- if (vmas)
- vmas[nr] = vma;
}
}
@@ -6508,9 +6505,9 @@ out_unlock:
}
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *locked)
+ struct page **pages, unsigned long *position,
+ unsigned long *nr_pages, long i, unsigned int flags,
+ int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -6638,7 +6635,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
- if (!pages && !vmas && !pfn_offset &&
+ if (!pages && !pfn_offset &&
(vaddr + huge_page_size(h) < vma->vm_end) &&
(remainder >= pages_per_huge_page(h))) {
vaddr += huge_page_size(h);
@@ -6653,11 +6650,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
- if (pages || vmas)
- record_subpages_vmas(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL,
- vmas ? vmas + i : NULL);
+ if (pages)
+ record_subpages(nth_page(page, pfn_offset),
+ vma, refs,
+ likely(pages) ? pages + i : NULL);
if (pages) {
/*
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 27f001e0f0a2..f42079b73f82 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -384,8 +384,9 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
}
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
- gfp_t gfp_mask, struct list_head *list)
+ struct list_head *list)
{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
int nid = page_to_nid((struct page *)start);
struct page *page, *next;
@@ -413,12 +414,11 @@ out:
* @end: end address of the vmemmap virtual address range that we want to
* remap.
* @reuse: reuse address.
- * @gfp_mask: GFP flag for allocating vmemmap pages.
*
* Return: %0 on success, negative error code otherwise.
*/
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse, gfp_t gfp_mask)
+ unsigned long reuse)
{
LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
@@ -430,7 +430,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
/* See the comment in the vmemmap_remap_free(). */
BUG_ON(start - reuse != PAGE_SIZE);
- if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
return -ENOMEM;
mmap_read_lock(&init_mm);
@@ -476,8 +476,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
* When a HugeTLB page is freed to the buddy allocator, previously
* discarded vmemmap pages must be allocated and remapping.
*/
- ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
- GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
if (!ret) {
ClearHPageVmemmapOptimized(head);
static_branch_dec(&hugetlb_optimize_vmemmap_key);
diff --git a/mm/internal.h b/mm/internal.h
index 68410c6d97ac..bb6542279599 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -208,10 +208,12 @@ extern char * const zone_names[MAX_NR_ZONES];
/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
-static inline bool is_check_pages_enabled(void)
-{
- return static_branch_unlikely(&check_pages_enabled);
-}
+extern int min_free_kbytes;
+
+void setup_per_zone_wmarks(void);
+void calculate_min_free_kbytes(void);
+int __meminit init_per_zone_wmark_min(void);
+void page_alloc_sysctl_init(void);
/*
* Structure for holding the mostly immutable allocation parameters passed
@@ -371,6 +373,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}
+void set_zone_contiguous(struct zone *zone);
+
+static inline void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
@@ -416,6 +425,10 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
+void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
+ unsigned long, enum meminit_context, struct vmem_altmap *, int);
+
+
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset);
@@ -1047,17 +1060,17 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
{
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
- if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) {
- printk("%lu > %lu\n", vmi->mas.index, vma->vm_start);
- printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
- printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
- mt_dump(vmi->mas.tree);
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.index > vma->vm_start)) {
+ pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
+ vmi->mas.index, vma->vm_start, vma->vm_start,
+ vma->vm_end, vmi->mas.index, vmi->mas.last);
}
- if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) {
- printk("%lu < %lu\n", vmi->mas.last, vma->vm_start);
- printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
- printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
- mt_dump(vmi->mas.tree);
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.last < vma->vm_start)) {
+ pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
+ vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
+ vmi->mas.index, vmi->mas.last);
}
#endif
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b376a5d055e5..256930da578a 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -445,7 +445,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
bool __kasan_check_byte(const void *address, unsigned long ip)
{
if (!kasan_byte_accessible(address)) {
- kasan_report((unsigned long)address, 1, false, ip);
+ kasan_report(address, 1, false, ip);
return false;
}
return true;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index e5eef670735e..224d161a5a22 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -40,39 +40,39 @@
* depending on memory access size X.
*/
-static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+static __always_inline bool memory_is_poisoned_1(const void *addr)
{
- s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr);
if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_GRANULE_MASK;
+ s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
return false;
}
-static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+static __always_inline bool memory_is_poisoned_2_4_8(const void *addr,
unsigned long size)
{
- u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr);
/*
* Access crosses 8(shadow size)-byte boundary. Such access maps
* into 2 shadow bytes, so we need to check them both.
*/
- if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
+ if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
return memory_is_poisoned_1(addr + size - 1);
}
-static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+static __always_inline bool memory_is_poisoned_16(const void *addr)
{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr);
/* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE)))
+ if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE)))
return *shadow_addr || memory_is_poisoned_1(addr + 15);
return *shadow_addr;
@@ -120,26 +120,25 @@ static __always_inline unsigned long memory_is_nonzero(const void *start,
return bytes_is_nonzero(start, (end - start) % 8);
}
-static __always_inline bool memory_is_poisoned_n(unsigned long addr,
- size_t size)
+static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size)
{
unsigned long ret;
- ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
- kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+ ret = memory_is_nonzero(kasan_mem_to_shadow(addr),
+ kasan_mem_to_shadow(addr + size - 1) + 1);
if (unlikely(ret)) {
- unsigned long last_byte = addr + size - 1;
- s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+ const void *last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
+ (((long)last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
return true;
}
return false;
}
-static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+static __always_inline bool memory_is_poisoned(const void *addr, size_t size)
{
if (__builtin_constant_p(size)) {
switch (size) {
@@ -159,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-static __always_inline bool check_region_inline(unsigned long addr,
+static __always_inline bool check_region_inline(const void *addr,
size_t size, bool write,
unsigned long ret_ip)
{
@@ -172,7 +171,7 @@ static __always_inline bool check_region_inline(unsigned long addr,
if (unlikely(addr + size < addr))
return !kasan_report(addr, size, write, ret_ip);
- if (unlikely(!addr_has_metadata((void *)addr)))
+ if (unlikely(!addr_has_metadata(addr)))
return !kasan_report(addr, size, write, ret_ip);
if (likely(!memory_is_poisoned(addr, size)))
@@ -181,7 +180,7 @@ static __always_inline bool check_region_inline(unsigned long addr,
return !kasan_report(addr, size, write, ret_ip);
}
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip)
{
return check_region_inline(addr, size, write, ret_ip);
@@ -221,36 +220,37 @@ static void register_global(struct kasan_global *global)
KASAN_GLOBAL_REDZONE, false);
}
-void __asan_register_globals(struct kasan_global *globals, size_t size)
+void __asan_register_globals(void *ptr, ssize_t size)
{
int i;
+ struct kasan_global *globals = ptr;
for (i = 0; i < size; i++)
register_global(&globals[i]);
}
EXPORT_SYMBOL(__asan_register_globals);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+void __asan_unregister_globals(void *ptr, ssize_t size)
{
}
EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
+ void __asan_load##size(void *addr) \
{ \
check_region_inline(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
+ void __asan_load##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
+ void __asan_store##size(void *addr) \
{ \
check_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
+ void __asan_store##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
@@ -259,24 +259,24 @@ DEFINE_ASAN_LOAD_STORE(4);
DEFINE_ASAN_LOAD_STORE(8);
DEFINE_ASAN_LOAD_STORE(16);
-void __asan_loadN(unsigned long addr, size_t size)
+void __asan_loadN(void *addr, ssize_t size)
{
kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
__alias(__asan_loadN)
-void __asan_loadN_noabort(unsigned long, size_t);
+void __asan_loadN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_loadN_noabort);
-void __asan_storeN(unsigned long addr, size_t size)
+void __asan_storeN(void *addr, ssize_t size)
{
kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
__alias(__asan_storeN)
-void __asan_storeN_noabort(unsigned long, size_t);
+void __asan_storeN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_storeN_noabort);
/* to shut up compiler complaints */
@@ -284,7 +284,7 @@ void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
/* Emitted by compiler to poison alloca()ed objects. */
-void __asan_alloca_poison(unsigned long addr, size_t size)
+void __asan_alloca_poison(void *addr, ssize_t size)
{
size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE);
size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
@@ -295,7 +295,7 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
KASAN_ALLOCA_REDZONE_SIZE);
const void *right_redzone = (const void *)(addr + rounded_up_size);
- WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+ WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE));
kasan_unpoison((const void *)(addr + rounded_down_size),
size - rounded_down_size, false);
@@ -307,18 +307,18 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
EXPORT_SYMBOL(__asan_alloca_poison);
/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom)
{
- if (unlikely(!stack_top || stack_top > stack_bottom))
+ if (unlikely(!stack_top || stack_top > (void *)stack_bottom))
return;
- kasan_unpoison(stack_top, stack_bottom - stack_top, false);
+ kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
/* Emitted by the compiler to [un]poison local variables. */
#define DEFINE_ASAN_SET_SHADOW(byte) \
- void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ void __asan_set_shadow_##byte(const void *addr, ssize_t size) \
{ \
__memset((void *)addr, 0x##byte, size); \
} \
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index f5e4f5f2ba20..b799f11e45dc 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -198,13 +198,13 @@ enum kasan_report_type {
struct kasan_report_info {
/* Filled in by kasan_report_*(). */
enum kasan_report_type type;
- void *access_addr;
+ const void *access_addr;
size_t access_size;
bool is_write;
unsigned long ip;
/* Filled in by the common reporting code. */
- void *first_bad_addr;
+ const void *first_bad_addr;
struct kmem_cache *cache;
void *object;
size_t alloc_size;
@@ -311,7 +311,7 @@ static __always_inline bool addr_has_metadata(const void *addr)
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip);
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -323,7 +323,7 @@ static __always_inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-void *kasan_find_first_bad_addr(void *addr, size_t size);
+const void *kasan_find_first_bad_addr(const void *addr, size_t size);
size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache);
void kasan_complete_mode_report_info(struct kasan_report_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
@@ -346,7 +346,7 @@ void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object);
static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { }
#endif
-bool kasan_report(unsigned long addr, size_t size,
+bool kasan_report(const void *addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
@@ -571,79 +571,82 @@ void kasan_restore_multi_shot(bool enabled);
*/
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
-void __asan_register_globals(struct kasan_global *globals, size_t size);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size);
+void __asan_register_globals(void *globals, ssize_t size);
+void __asan_unregister_globals(void *globals, ssize_t size);
void __asan_handle_no_return(void);
-void __asan_alloca_poison(unsigned long addr, size_t size);
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
-
-void __asan_load1(unsigned long addr);
-void __asan_store1(unsigned long addr);
-void __asan_load2(unsigned long addr);
-void __asan_store2(unsigned long addr);
-void __asan_load4(unsigned long addr);
-void __asan_store4(unsigned long addr);
-void __asan_load8(unsigned long addr);
-void __asan_store8(unsigned long addr);
-void __asan_load16(unsigned long addr);
-void __asan_store16(unsigned long addr);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
-
-void __asan_load1_noabort(unsigned long addr);
-void __asan_store1_noabort(unsigned long addr);
-void __asan_load2_noabort(unsigned long addr);
-void __asan_store2_noabort(unsigned long addr);
-void __asan_load4_noabort(unsigned long addr);
-void __asan_store4_noabort(unsigned long addr);
-void __asan_load8_noabort(unsigned long addr);
-void __asan_store8_noabort(unsigned long addr);
-void __asan_load16_noabort(unsigned long addr);
-void __asan_store16_noabort(unsigned long addr);
-void __asan_loadN_noabort(unsigned long addr, size_t size);
-void __asan_storeN_noabort(unsigned long addr, size_t size);
-
-void __asan_report_load1_noabort(unsigned long addr);
-void __asan_report_store1_noabort(unsigned long addr);
-void __asan_report_load2_noabort(unsigned long addr);
-void __asan_report_store2_noabort(unsigned long addr);
-void __asan_report_load4_noabort(unsigned long addr);
-void __asan_report_store4_noabort(unsigned long addr);
-void __asan_report_load8_noabort(unsigned long addr);
-void __asan_report_store8_noabort(unsigned long addr);
-void __asan_report_load16_noabort(unsigned long addr);
-void __asan_report_store16_noabort(unsigned long addr);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size);
-
-void __asan_set_shadow_00(const void *addr, size_t size);
-void __asan_set_shadow_f1(const void *addr, size_t size);
-void __asan_set_shadow_f2(const void *addr, size_t size);
-void __asan_set_shadow_f3(const void *addr, size_t size);
-void __asan_set_shadow_f5(const void *addr, size_t size);
-void __asan_set_shadow_f8(const void *addr, size_t size);
-
-void *__asan_memset(void *addr, int c, size_t len);
-void *__asan_memmove(void *dest, const void *src, size_t len);
-void *__asan_memcpy(void *dest, const void *src, size_t len);
-
-void __hwasan_load1_noabort(unsigned long addr);
-void __hwasan_store1_noabort(unsigned long addr);
-void __hwasan_load2_noabort(unsigned long addr);
-void __hwasan_store2_noabort(unsigned long addr);
-void __hwasan_load4_noabort(unsigned long addr);
-void __hwasan_store4_noabort(unsigned long addr);
-void __hwasan_load8_noabort(unsigned long addr);
-void __hwasan_store8_noabort(unsigned long addr);
-void __hwasan_load16_noabort(unsigned long addr);
-void __hwasan_store16_noabort(unsigned long addr);
-void __hwasan_loadN_noabort(unsigned long addr, size_t size);
-void __hwasan_storeN_noabort(unsigned long addr, size_t size);
-
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
-
-void *__hwasan_memset(void *addr, int c, size_t len);
-void *__hwasan_memmove(void *dest, const void *src, size_t len);
-void *__hwasan_memcpy(void *dest, const void *src, size_t len);
+void __asan_alloca_poison(void *, ssize_t size);
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom);
+
+void __asan_load1(void *);
+void __asan_store1(void *);
+void __asan_load2(void *);
+void __asan_store2(void *);
+void __asan_load4(void *);
+void __asan_store4(void *);
+void __asan_load8(void *);
+void __asan_store8(void *);
+void __asan_load16(void *);
+void __asan_store16(void *);
+void __asan_loadN(void *, ssize_t size);
+void __asan_storeN(void *, ssize_t size);
+
+void __asan_load1_noabort(void *);
+void __asan_store1_noabort(void *);
+void __asan_load2_noabort(void *);
+void __asan_store2_noabort(void *);
+void __asan_load4_noabort(void *);
+void __asan_store4_noabort(void *);
+void __asan_load8_noabort(void *);
+void __asan_store8_noabort(void *);
+void __asan_load16_noabort(void *);
+void __asan_store16_noabort(void *);
+void __asan_loadN_noabort(void *, ssize_t size);
+void __asan_storeN_noabort(void *, ssize_t size);
+
+void __asan_report_load1_noabort(void *);
+void __asan_report_store1_noabort(void *);
+void __asan_report_load2_noabort(void *);
+void __asan_report_store2_noabort(void *);
+void __asan_report_load4_noabort(void *);
+void __asan_report_store4_noabort(void *);
+void __asan_report_load8_noabort(void *);
+void __asan_report_store8_noabort(void *);
+void __asan_report_load16_noabort(void *);
+void __asan_report_store16_noabort(void *);
+void __asan_report_load_n_noabort(void *, ssize_t size);
+void __asan_report_store_n_noabort(void *, ssize_t size);
+
+void __asan_set_shadow_00(const void *addr, ssize_t size);
+void __asan_set_shadow_f1(const void *addr, ssize_t size);
+void __asan_set_shadow_f2(const void *addr, ssize_t size);
+void __asan_set_shadow_f3(const void *addr, ssize_t size);
+void __asan_set_shadow_f5(const void *addr, ssize_t size);
+void __asan_set_shadow_f8(const void *addr, ssize_t size);
+
+void *__asan_memset(void *addr, int c, ssize_t len);
+void *__asan_memmove(void *dest, const void *src, ssize_t len);
+void *__asan_memcpy(void *dest, const void *src, ssize_t len);
+
+void __hwasan_load1_noabort(void *);
+void __hwasan_store1_noabort(void *);
+void __hwasan_load2_noabort(void *);
+void __hwasan_store2_noabort(void *);
+void __hwasan_load4_noabort(void *);
+void __hwasan_store4_noabort(void *);
+void __hwasan_load8_noabort(void *);
+void __hwasan_store8_noabort(void *);
+void __hwasan_load16_noabort(void *);
+void __hwasan_store16_noabort(void *);
+void __hwasan_loadN_noabort(void *, ssize_t size);
+void __hwasan_storeN_noabort(void *, ssize_t size);
+
+void __hwasan_tag_memory(void *, u8 tag, ssize_t size);
+
+void *__hwasan_memset(void *addr, int c, ssize_t len);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len);
+
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
+ unsigned long ret_ip);
#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 892a9dc9d4d3..84d9f3b37014 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -211,7 +211,7 @@ static void start_report(unsigned long *flags, bool sync)
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags, void *addr)
+static void end_report(unsigned long *flags, const void *addr)
{
if (addr)
trace_error_report_end(ERROR_DETECTOR_KASAN,
@@ -450,8 +450,8 @@ static void print_memory_metadata(const void *addr)
static void print_report(struct kasan_report_info *info)
{
- void *addr = kasan_reset_tag(info->access_addr);
- u8 tag = get_tag(info->access_addr);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
+ u8 tag = get_tag((void *)info->access_addr);
print_error_description(info);
if (addr_has_metadata(addr))
@@ -468,12 +468,12 @@ static void print_report(struct kasan_report_info *info)
static void complete_report_info(struct kasan_report_info *info)
{
- void *addr = kasan_reset_tag(info->access_addr);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
struct slab *slab;
if (info->type == KASAN_REPORT_ACCESS)
info->first_bad_addr = kasan_find_first_bad_addr(
- info->access_addr, info->access_size);
+ (void *)info->access_addr, info->access_size);
else
info->first_bad_addr = addr;
@@ -544,11 +544,10 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
* user_access_save/restore(): kasan_report_invalid_free() cannot be called
* from a UACCESS region, and kasan_report_async() is not used on x86.
*/
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
+bool kasan_report(const void *addr, size_t size, bool is_write,
unsigned long ip)
{
bool ret = true;
- void *ptr = (void *)addr;
unsigned long ua_flags = user_access_save();
unsigned long irq_flags;
struct kasan_report_info info;
@@ -562,7 +561,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
- info.access_addr = ptr;
+ info.access_addr = addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
@@ -571,7 +570,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
print_report(&info);
- end_report(&irq_flags, ptr);
+ end_report(&irq_flags, (void *)addr);
out:
user_access_restore(ua_flags);
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 87d39bc0a673..51a1e8a8877f 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -30,9 +30,9 @@
#include "kasan.h"
#include "../slab.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
- void *p = addr;
+ const void *p = addr;
if (!addr_has_metadata(p))
return p;
@@ -362,14 +362,14 @@ void kasan_print_address_stack_frame(const void *addr)
#endif /* CONFIG_KASAN_STACK */
#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
+void __asan_report_load##size##_noabort(void *addr) \
{ \
kasan_report(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_report_load##size##_noabort)
#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
+void __asan_report_store##size##_noabort(void *addr) \
{ \
kasan_report(addr, size, true, _RET_IP_); \
} \
@@ -386,13 +386,13 @@ DEFINE_ASAN_REPORT_STORE(4);
DEFINE_ASAN_REPORT_STORE(8);
DEFINE_ASAN_REPORT_STORE(16);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+void __asan_report_load_n_noabort(void *addr, ssize_t size)
{
kasan_report(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_report_load_n_noabort);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+void __asan_report_store_n_noabort(void *addr, ssize_t size)
{
kasan_report(addr, size, true, _RET_IP_);
}
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 32e80f78de7d..065e1b2fc484 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -15,7 +15,7 @@
#include "kasan.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
/*
* Hardware Tag-Based KASAN only calls this function for normal memory
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 8b1f5a73ee6d..689e94f9fe3c 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -30,7 +30,7 @@
#include "kasan.h"
#include "../slab.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
u8 tag = get_tag(addr);
void *p = kasan_reset_tag(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index c8b86f3273b5..3e62728ae25d 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -28,13 +28,13 @@
bool __kasan_check_read(const volatile void *p, unsigned int size)
{
- return kasan_check_range((unsigned long)p, size, false, _RET_IP_);
+ return kasan_check_range((void *)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_read);
bool __kasan_check_write(const volatile void *p, unsigned int size)
{
- return kasan_check_range((unsigned long)p, size, true, _RET_IP_);
+ return kasan_check_range((void *)p, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_write);
@@ -50,7 +50,7 @@ EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -60,8 +60,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -71,17 +71,17 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
}
#endif
-void *__asan_memset(void *addr, int c, size_t len)
+void *__asan_memset(void *addr, int c, ssize_t len)
{
- if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -89,10 +89,10 @@ void *__asan_memset(void *addr, int c, size_t len)
EXPORT_SYMBOL(__asan_memset);
#ifdef __HAVE_ARCH_MEMMOVE
-void *__asan_memmove(void *dest, const void *src, size_t len)
+void *__asan_memmove(void *dest, const void *src, ssize_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -100,10 +100,10 @@ void *__asan_memmove(void *dest, const void *src, size_t len)
EXPORT_SYMBOL(__asan_memmove);
#endif
-void *__asan_memcpy(void *dest, const void *src, size_t len)
+void *__asan_memcpy(void *dest, const void *src, ssize_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
@@ -111,13 +111,13 @@ void *__asan_memcpy(void *dest, const void *src, size_t len)
EXPORT_SYMBOL(__asan_memcpy);
#ifdef CONFIG_KASAN_SW_TAGS
-void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset);
+void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset);
EXPORT_SYMBOL(__hwasan_memset);
#ifdef __HAVE_ARCH_MEMMOVE
-void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove);
EXPORT_SYMBOL(__hwasan_memmove);
#endif
-void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy);
EXPORT_SYMBOL(__hwasan_memcpy);
#endif
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 30da65fa02a1..220b5d4c6876 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -70,8 +70,8 @@ u8 kasan_random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+bool kasan_check_range(const void *addr, size_t size, bool write,
+ unsigned long ret_ip)
{
u8 tag;
u8 *shadow_first, *shadow_last, *shadow;
@@ -133,12 +133,12 @@ bool kasan_byte_accessible(const void *addr)
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
- void __hwasan_load##size##_noabort(unsigned long addr) \
+ void __hwasan_load##size##_noabort(void *addr) \
{ \
- kasan_check_range(addr, size, false, _RET_IP_); \
+ kasan_check_range(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
- void __hwasan_store##size##_noabort(unsigned long addr) \
+ void __hwasan_store##size##_noabort(void *addr) \
{ \
kasan_check_range(addr, size, true, _RET_IP_); \
} \
@@ -150,25 +150,25 @@ DEFINE_HWASAN_LOAD_STORE(4);
DEFINE_HWASAN_LOAD_STORE(8);
DEFINE_HWASAN_LOAD_STORE(16);
-void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
+void __hwasan_loadN_noabort(void *addr, ssize_t size)
{
kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_loadN_noabort);
-void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
+void __hwasan_storeN_noabort(void *addr, ssize_t size)
{
kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_storeN_noabort);
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
+void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size)
{
- kasan_poison((void *)addr, size, tag, false);
+ kasan_poison(addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
-void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
unsigned long ret_ip)
{
kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b27e245a055..d31fb1e2cb33 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(bool atomic)
+static void do_flush_stats(void)
{
/*
* We always flush the entire tree, so concurrent flushers can just
@@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic)
WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
- if (atomic)
- cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
- else
- cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
atomic_set(&stats_flush_threshold, 0);
atomic_set(&stats_flush_ongoing, 0);
}
-static bool should_flush_stats(void)
-{
- return atomic_read(&stats_flush_threshold) > num_online_cpus();
-}
-
void mem_cgroup_flush_stats(void)
{
- if (should_flush_stats())
- do_flush_stats(false);
-}
-
-void mem_cgroup_flush_stats_atomic(void)
-{
- if (should_flush_stats())
- do_flush_stats(true);
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ do_flush_stats();
}
void mem_cgroup_flush_stats_ratelimited(void)
@@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Always flush here so that flushing in latency-sensitive paths is
* as cheap as possible.
*/
- do_flush_stats(false);
+ do_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -1580,13 +1566,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
+static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct seq_buf s;
int i;
- seq_buf_init(&s, buf, bufsize);
-
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
@@ -1603,21 +1586,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
u64 size;
size = memcg_page_state_output(memcg, memory_stats[i].idx);
- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+ seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
size += memcg_page_state_output(memcg,
NR_SLAB_RECLAIMABLE_B);
- seq_buf_printf(&s, "slab %llu\n", size);
+ seq_buf_printf(s, "slab %llu\n", size);
}
}
/* Accumulated memory events */
- seq_buf_printf(&s, "pgscan %lu\n",
+ seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
- seq_buf_printf(&s, "pgsteal %lu\n",
+ seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
@@ -1627,13 +1610,24 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
memcg_vm_event_stat[i] == PGPGOUT)
continue;
- seq_buf_printf(&s, "%s %lu\n",
+ seq_buf_printf(s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
}
/* The above should easily fit into one page */
- WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
+}
+
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+
+static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg_stat_format(memcg, s);
+ else
+ memcg1_stat_format(memcg, s);
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -1671,6 +1665,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[PAGE_SIZE];
+ struct seq_buf s;
lockdep_assert_held(&oom_lock);
@@ -1693,8 +1688,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- memory_stat_format(memcg, buf, sizeof(buf));
- pr_info("%s", buf);
+ seq_buf_init(&s, buf, sizeof(buf));
+ memory_stat_format(memcg, &s);
+ seq_buf_do_printk(&s, KERN_INFO);
}
/*
@@ -2028,26 +2024,12 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked)
mem_cgroup_oom_notify(memcg);
- if (locked && !READ_ONCE(memcg->oom_kill_disable)) {
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
- current->memcg_oom_order);
- } else {
- schedule();
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- }
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
- if (locked) {
+ if (locked)
mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitly.
- */
- memcg_oom_recover(memcg);
- }
cleanup:
current->memcg_in_oom = NULL;
css_put(&memcg->css);
@@ -2275,7 +2257,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+ if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
ret = true;
}
@@ -2290,7 +2272,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
- struct mem_cgroup *old = stock->cached;
+ struct mem_cgroup *old = READ_ONCE(stock->cached);
if (!old)
return;
@@ -2303,7 +2285,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
}
css_put(&old->css);
- stock->cached = NULL;
+ WRITE_ONCE(stock->cached, NULL);
}
static void drain_local_stock(struct work_struct *dummy)
@@ -2338,10 +2320,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
struct memcg_stock_pcp *stock;
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached != memcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
- stock->cached = memcg;
+ WRITE_ONCE(stock->cached, memcg);
}
stock->nr_pages += nr_pages;
@@ -2383,7 +2365,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
bool flush = false;
rcu_read_lock();
- memcg = stock->cached;
+ memcg = READ_ONCE(stock->cached);
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
@@ -3208,12 +3190,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
- if (stock->cached_objcg != objcg) {
+ if (READ_ONCE(stock->cached_objcg) != objcg) {
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
@@ -3267,7 +3249,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
@@ -3279,7 +3261,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
- struct obj_cgroup *old = stock->cached_objcg;
+ struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
return NULL;
@@ -3332,7 +3314,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
stock->cached_pgdat = NULL;
}
- stock->cached_objcg = NULL;
+ WRITE_ONCE(stock->cached_objcg, NULL);
/*
* The `old' objects needs to be released by the caller via
* obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
@@ -3343,10 +3325,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
+ struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (objcg) {
+ memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3365,10 +3348,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached_objcg != objcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
allow_uncharge = true; /* Allow uncharge when objcg changes */
@@ -3699,27 +3682,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
if (mem_cgroup_is_root(memcg)) {
/*
- * We can reach here from irq context through:
- * uncharge_batch()
- * |--memcg_check_events()
- * |--mem_cgroup_threshold()
- * |--__mem_cgroup_threshold()
- * |--mem_cgroup_usage
- *
- * rstat flushing is an expensive operation that should not be
- * done from irq context; use stale stats in this case.
- * Arguably, usage threshold events are not reliable on the root
- * memcg anyway since its usage is ill-defined.
- *
- * Additionally, other call paths through memcg_check_events()
- * disable irqs, so make sure we are flushing stats atomically.
+ * Approximate root's usage from global state. This isn't
+ * perfect, but the root usage was always an approximation.
*/
- if (in_task())
- mem_cgroup_flush_stats_atomic();
- val = memcg_page_state(memcg, NR_FILE_PAGES) +
- memcg_page_state(memcg, NR_ANON_MAPPED);
+ val = global_node_page_state(NR_FILE_PAGES) +
+ global_node_page_state(NR_ANON_MAPPED);
if (swap)
- val += memcg_page_state(memcg, MEMCG_SWAP);
+ val += total_swap_pages - get_nr_swap_pages();
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -4135,9 +4104,8 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static int memcg_stat_show(struct seq_file *m, void *v)
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
@@ -4152,18 +4120,18 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
- memcg_events_local(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", lru_list_name(i),
- memcg_page_state_local(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -4171,11 +4139,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
memory = min(memory, READ_ONCE(mi->memory.max));
memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
- seq_printf(m, "hierarchical_memory_limit %llu\n",
- (u64)memory * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
if (do_memsw_account())
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
- (u64)memsw * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4183,19 +4151,19 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n",
- vm_event_name(memcg1_events[i]),
- (u64)memcg_events(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4210,12 +4178,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "anon_cost %lu\n", anon_cost);
- seq_printf(m, "file_cost %lu\n", file_cost);
+ seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
+ seq_buf_printf(s, "file_cost %lu\n", file_cost);
}
#endif
-
- return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
@@ -4648,11 +4614,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- /*
- * wb_writeback() takes a spinlock and calls
- * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
- */
- mem_cgroup_flush_stats_atomic();
+ mem_cgroup_flush_stats();
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5059,6 +5021,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
}
#endif
+static int memory_stat_show(struct seq_file *m, void *v);
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -5091,7 +5055,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
{
.name = "stat",
- .seq_show = memcg_stat_show,
+ .seq_show = memory_stat_show,
},
{
.name = "force_empty",
@@ -6634,10 +6598,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ struct seq_buf s;
if (!buf)
return -ENOMEM;
- memory_stat_format(memcg, buf, PAGE_SIZE);
+ seq_buf_init(&s, buf, PAGE_SIZE);
+ memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5b663eca1f29..004a02f44271 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -123,7 +123,6 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};
-#ifdef CONFIG_SYSCTL
static struct ctl_table memory_failure_table[] = {
{
.procname = "memory_failure_early_kill",
@@ -146,14 +145,6 @@ static struct ctl_table memory_failure_table[] = {
{ }
};
-static int __init memory_failure_sysctl_init(void)
-{
- register_sysctl_init("vm", memory_failure_table);
- return 0;
-}
-late_initcall(memory_failure_sysctl_init);
-#endif /* CONFIG_SYSCTL */
-
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -2441,6 +2432,8 @@ static int __init memory_failure_init(void)
INIT_WORK(&mf_cpu->work, memory_failure_work_func);
}
+ register_sysctl_init("vm", memory_failure_table);
+
return 0;
}
core_initcall(memory_failure_init);
diff --git a/mm/memory.c b/mm/memory.c
index 146bb94764f8..f2ae86338545 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5590,7 +5590,6 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
int len, unsigned int gup_flags)
{
- struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
@@ -5599,29 +5598,30 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
/* ignore errors, just check how much was successfully transferred */
while (len) {
- int bytes, ret, offset;
+ int bytes, offset;
void *maddr;
- struct page *page = NULL;
+ struct vm_area_struct *vma;
+ struct page *page = get_user_page_vma_remote(mm, addr,
+ gup_flags, &vma);
- ret = get_user_pages_remote(mm, addr, 1,
- gup_flags, &page, &vma, NULL);
- if (ret <= 0) {
+ if (IS_ERR_OR_NULL(page)) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
+ int res = 0;
+
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
- vma = vma_lookup(mm, addr);
if (!vma)
break;
if (vma->vm_ops && vma->vm_ops->access)
- ret = vma->vm_ops->access(vma, addr, buf,
+ res = vma->vm_ops->access(vma, addr, buf,
len, write);
- if (ret <= 0)
+ if (res <= 0)
break;
- bytes = ret;
+ bytes = res;
#endif
} else {
bytes = len;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e0fa209d533..9061ac69b1b6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -325,7 +325,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
}
if (check_pfn_span(pfn, nr_pages)) {
- WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return -EINVAL;
}
@@ -525,7 +525,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages)) {
- WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 01cac26a3127..cb292d2a90ce 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -692,37 +692,32 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
{
struct buffer_head *bh = head;
+ struct buffer_head *failed_bh;
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
do {
if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- bh = bh->b_this_page;
- }
- return false;
+ if (mode == MIGRATE_ASYNC)
+ goto unlock;
+ if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
+ goto unlock;
+ lock_buffer(bh);
}
bh = bh->b_this_page;
} while (bh != head);
+
return true;
+
+unlock:
+ /* We failed to lock the buffer and cannot stall. */
+ failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+
+ return false;
}
static int __buffer_migrate_folio(struct address_space *mapping,
@@ -1156,6 +1151,14 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
if (current->flags & PF_MEMALLOC)
goto out;
+ /*
+ * In "light" mode, we can wait for transient locks (eg
+ * inserting a page into the page table), but it's not
+ * worth waiting for I/O.
+ */
+ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
+ goto out;
+
folio_lock(src);
}
locked = true;
@@ -1614,13 +1617,10 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
int nr_pass)
{
int retry = 1;
- int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
int nr_retry_pages = 0;
- int nr_large_failed = 0;
int pass = 0;
- bool is_large = false;
bool is_thp = false;
struct folio *folio, *folio2, *dst = NULL, *dst2;
int rc, rc_saved = 0, nr_pages;
@@ -1631,20 +1631,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
!list_empty(from) && !list_is_singular(from));
- for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
- large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
list_for_each_entry_safe(folio, folio2, from, lru) {
- /*
- * Large folio statistics is based on the source large
- * folio. Capture required information that might get
- * lost during migration.
- */
- is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
@@ -1660,7 +1653,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* list is processed.
*/
if (!thp_migration_supported() && is_thp) {
- nr_large_failed++;
+ nr_failed++;
stats->nr_thp_failed++;
if (!try_split_folio(folio, split_folios)) {
stats->nr_thp_split++;
@@ -1688,38 +1681,33 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* When memory is low, don't bother to try to migrate
* other folios, move unmapped folios, then exit.
*/
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- /* Large folio NUMA faulting doesn't split to retry. */
- if (!nosplit) {
- int ret = try_split_folio(folio, split_folios);
-
- if (!ret) {
- stats->nr_thp_split += is_thp;
- break;
- } else if (reason == MR_LONGTERM_PIN &&
- ret == -EAGAIN) {
- /*
- * Try again to split large folio to
- * mitigate the failure of longterm pinning.
- */
- large_retry++;
- thp_retry += is_thp;
- nr_retry_pages += nr_pages;
- /* Undo duplicated failure counting. */
- nr_large_failed--;
- stats->nr_thp_failed -= is_thp;
- break;
- }
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
+ /* Large folio NUMA faulting doesn't split to retry. */
+ if (folio_test_large(folio) && !nosplit) {
+ int ret = try_split_folio(folio, split_folios);
+
+ if (!ret) {
+ stats->nr_thp_split += is_thp;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split large folio to
+ * mitigate the failure of longterm pinning.
+ */
+ retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ /* Undo duplicated failure counting. */
+ nr_failed--;
+ stats->nr_thp_failed -= is_thp;
+ break;
}
- } else {
- nr_failed++;
}
stats->nr_failed_pages += nr_pages + nr_retry_pages;
/* nr_failed isn't updated for not used */
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
rc_saved = rc;
if (list_empty(&unmap_folios))
@@ -1727,12 +1715,8 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
else
goto move;
case -EAGAIN:
- if (is_large) {
- large_retry++;
- thp_retry += is_thp;
- } else {
- retry++;
- }
+ retry++;
+ thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
@@ -1750,20 +1734,14 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* removed from migration folio list and not
* retried in the next outer loop.
*/
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- } else {
- nr_failed++;
- }
-
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages;
break;
}
}
}
nr_failed += retry;
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
move:
@@ -1771,17 +1749,15 @@ move:
try_to_unmap_flush();
retry = 1;
- for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
- large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
dst = list_first_entry(&dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
@@ -1797,12 +1773,8 @@ move:
*/
switch(rc) {
case -EAGAIN:
- if (is_large) {
- large_retry++;
- thp_retry += is_thp;
- } else {
- retry++;
- }
+ retry++;
+ thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
@@ -1810,13 +1782,8 @@ move:
stats->nr_thp_succeeded += is_thp;
break;
default:
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- } else {
- nr_failed++;
- }
-
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages;
break;
}
@@ -1825,14 +1792,10 @@ move:
}
}
nr_failed += retry;
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
- if (rc_saved)
- rc = rc_saved;
- else
- rc = nr_failed + nr_large_failed;
+ rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
dst = list_first_entry(&dst_folios, struct folio, lru);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7f7f9c677854..10bf560302c4 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core,
return 0;
}
+bool mirrored_kernelcore __initdata_memblock;
+
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
@@ -2328,6 +2330,28 @@ void __init init_cma_reserved_pageblock(struct page *page)
}
#endif
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ cond_resched();
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
void __init page_alloc_init_late(void)
{
struct zone *zone;
@@ -2368,6 +2392,8 @@ void __init page_alloc_init_late(void)
/* Initialize page ext after all struct pages are initialized. */
if (deferred_struct_pages)
page_ext_init();
+
+ page_alloc_sysctl_init();
}
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
@@ -2541,6 +2567,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
__free_pages_core(page, order);
}
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
+EXPORT_SYMBOL(init_on_alloc);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
+EXPORT_SYMBOL(init_on_free);
+
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
diff --git a/mm/mmap.c b/mm/mmap.c
index 13678edaa22c..877696464c09 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -299,62 +299,44 @@ out:
return origbrk;
}
-#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
-extern void mt_validate(struct maple_tree *mt);
-extern void mt_dump(const struct maple_tree *mt);
-
-/* Validate the maple tree */
-static void validate_mm_mt(struct mm_struct *mm)
-{
- struct maple_tree *mt = &mm->mm_mt;
- struct vm_area_struct *vma_mt;
-
- MA_STATE(mas, mt, 0, 0);
-
- mt_validate(&mm->mm_mt);
- mas_for_each(&mas, vma_mt, ULONG_MAX) {
- if ((vma_mt->vm_start != mas.index) ||
- (vma_mt->vm_end - 1 != mas.last)) {
- pr_emerg("issue in %s\n", current->comm);
- dump_stack();
- dump_vma(vma_mt);
- pr_emerg("mt piv: %p %lu - %lu\n", vma_mt,
- mas.index, mas.last);
- pr_emerg("mt vma: %p %lu - %lu\n", vma_mt,
- vma_mt->vm_start, vma_mt->vm_end);
-
- mt_dump(mas.tree);
- if (vma_mt->vm_end != mas.last + 1) {
- pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n",
- mm, vma_mt->vm_start, vma_mt->vm_end,
- mas.index, mas.last);
- mt_dump(mas.tree);
- }
- VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
- if (vma_mt->vm_start != mas.index) {
- pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n",
- mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
- mt_dump(mas.tree);
- }
- VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
- }
- }
-}
-
+#if defined(CONFIG_DEBUG_VM)
static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
struct vm_area_struct *vma;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);
- validate_mm_mt(mm);
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ mt_validate(&mm->mm_mt);
+#endif
- mas_for_each(&mas, vma, ULONG_MAX) {
+ for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
+#endif
+ unsigned long vmi_start, vmi_end;
+ bool warn = 0;
+
+ vmi_start = vma_iter_addr(&vmi);
+ vmi_end = vma_iter_end(&vmi);
+ if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
+ warn = 1;
+
+ if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
+ warn = 1;
+
+ if (warn) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma);
+ pr_emerg("tree range: %px start %lx end %lx\n", vma,
+ vmi_start, vmi_end - 1);
+ vma_iter_dump_tree(&vmi);
+ }
+#ifdef CONFIG_DEBUG_VM_RB
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
@@ -365,16 +347,15 @@ static void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
+ pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
-#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
-#define validate_mm_mt(root) do { } while (0)
+#else /* !CONFIG_DEBUG_VM */
#define validate_mm(mm) do { } while (0)
-#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+#endif /* CONFIG_DEBUG_VM */
/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -1475,6 +1456,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
+{
+ return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
+}
+
+static bool vma_is_shared_writable(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
+ (VM_WRITE | VM_SHARED);
+}
+
+static bool vma_fs_can_writeback(struct vm_area_struct *vma)
+{
+ /* No managed pages to writeback. */
+ if (vma->vm_flags & VM_PFNMAP)
+ return false;
+
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_can_writeback(vma->vm_file->f_mapping);
+}
+
+/*
+ * Does this VMA require the underlying folios to have their dirty state
+ * tracked?
+ */
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
+{
+ /* Only shared, writable VMAs require dirty tracking. */
+ if (!vma_is_shared_writable(vma))
+ return false;
+
+ /* Does the filesystem need to be notified? */
+ if (vm_ops_needs_writenotify(vma->vm_ops))
+ return true;
+
+ /*
+ * Even if the filesystem doesn't indicate a need for writenotify, if it
+ * can writeback, dirty tracking is still required.
+ */
+ return vma_fs_can_writeback(vma);
+}
+
/*
* Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
@@ -1483,21 +1506,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
*/
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
- vm_flags_t vm_flags = vma->vm_flags;
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
-
/* If it was private or non-writable, the write bit is already clear */
- if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ if (!vma_is_shared_writable(vma))
return 0;
/* The backer wishes to know when pages are first written to? */
- if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
+ if (vm_ops_needs_writenotify(vma->vm_ops))
return 1;
/* The open routine did something to the protections that pgprot_modify
* won't preserve? */
if (pgprot_val(vm_page_prot) !=
- pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
+ pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
return 0;
/*
@@ -1511,13 +1531,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
if (userfaultfd_wp(vma))
return 1;
- /* Specialty mapping? */
- if (vm_flags & VM_PFNMAP)
- return 0;
-
/* Can the mapping track the dirty pages? */
- return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_can_writeback(vma->vm_file->f_mapping);
+ return vma_fs_can_writeback(vma);
}
/*
@@ -2234,7 +2249,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
WARN_ON(vma->vm_start >= addr);
WARN_ON(vma->vm_end <= addr);
@@ -2292,7 +2307,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Success. */
if (new_below)
vma_next(vmi);
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
out_free_mpol:
@@ -2301,7 +2316,7 @@ out_free_vmi:
vma_iter_free(vmi);
out_free_vma:
vm_area_free(new);
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
return err;
}
@@ -2410,7 +2425,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
#endif
}
- next = vma_next(vmi);
+ if (vma_iter_end(vmi) > end)
+ next = vma_iter_load(vmi);
+
+ if (!next)
+ next = vma_next(vmi);
+
if (unlikely(uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
@@ -2623,6 +2643,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
cannot_expand:
+ if (prev)
+ vma_iter_next_range(&vmi);
+
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
@@ -2936,7 +2959,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
arch_unmap(mm, start, end);
ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
- validate_mm_mt(mm);
+ validate_mm(mm);
return ret;
}
@@ -2958,7 +2981,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm = current->mm;
struct vma_prepare vp;
- validate_mm_mt(mm);
+ validate_mm(mm);
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
@@ -3199,7 +3222,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
bool faulted_in_anon_vma = true;
VMA_ITERATOR(vmi, mm, addr);
- validate_mm_mt(mm);
+ validate_mm(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3258,7 +3281,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
goto out_vma_link;
*need_rmap_locks = false;
}
- validate_mm_mt(mm);
+ validate_mm(mm);
return new_vma;
out_vma_link:
@@ -3274,7 +3297,7 @@ out_free_mempol:
out_free_vma:
vm_area_free(new_vma);
out:
- validate_mm_mt(mm);
+ validate_mm(mm);
return NULL;
}
@@ -3411,7 +3434,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
- validate_mm_mt(mm);
+ validate_mm(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3434,12 +3457,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
- validate_mm_mt(mm);
+ validate_mm(mm);
return vma;
out:
vm_area_free(vma);
- validate_mm_mt(mm);
+ validate_mm(mm);
return ERR_PTR(ret);
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 044e1eed720e..612b5597d3af 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1130,12 +1130,10 @@ bool out_of_memory(struct oom_control *oc)
/*
* The OOM killer does not compensate for IO-less reclaim.
- * pagefault_out_of_memory lost its gfp context so we have to
- * make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
- * invoke the OOM killer even if it is a GFP_NOFS allocation.
+ * But mem_cgroup_oom() has to invoke the OOM killer even
+ * if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
+ if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47421bedc12b..1023f41de2fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,12 +18,8 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
#include <linux/interrupt.h>
-#include <linux/pagemap.h>
#include <linux/jiffies.h>
-#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
@@ -31,8 +27,6 @@
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
@@ -41,19 +35,10 @@
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
-#include <linux/vmalloc.h>
#include <linux/vmstat.h>
-#include <linux/mempolicy.h>
-#include <linux/memremap.h>
-#include <linux/stop_machine.h>
-#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
-#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
-#include <linux/page-isolation.h>
-#include <linux/debugobjects.h>
-#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
@@ -61,12 +46,9 @@
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
-#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
@@ -74,13 +56,10 @@
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
-#include <asm/sections.h>
-#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
-#include "swap.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
@@ -227,18 +206,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-atomic_long_t _totalram_pages __read_mostly;
-EXPORT_SYMBOL(_totalram_pages);
-unsigned long totalreserve_pages __read_mostly;
-unsigned long totalcma_pages __read_mostly;
-
-int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
-EXPORT_SYMBOL(init_on_alloc);
-
-DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
-EXPORT_SYMBOL(init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -258,44 +226,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
page->index = migratetype;
}
-#ifdef CONFIG_PM_SLEEP
-/*
- * The following functions are used by the suspend/hibernate code to temporarily
- * change gfp_allowed_mask in order to avoid using I/O during memory allocations
- * while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with system_transition_mutex held
- * (gfp_allowed_mask also should only be modified with system_transition_mutex
- * held, unless the suspend/hibernate code is guaranteed not to run in parallel
- * with that modification).
- */
-
-static gfp_t saved_gfp_mask;
-
-void pm_restore_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (saved_gfp_mask) {
- gfp_allowed_mask = saved_gfp_mask;
- saved_gfp_mask = 0;
- }
-}
-
-void pm_restrict_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- WARN_ON(saved_gfp_mask);
- saved_gfp_mask = gfp_allowed_mask;
- gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
-}
-
-bool pm_suspended_storage(void)
-{
- if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
- return false;
- return true;
-}
-#endif /* CONFIG_PM_SLEEP */
-
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
@@ -371,10 +301,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
-int watermark_boost_factor __read_mostly = 15000;
-int watermark_scale_factor = 10;
-
-bool mirrored_kernelcore __initdata_memblock;
+static int watermark_boost_factor __read_mostly = 15000;
+static int watermark_scale_factor = 10;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -550,13 +478,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
return ret;
}
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
- if (zone != page_zone(page))
- return 0;
-
- return 1;
-}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -564,7 +485,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
- if (!page_is_consistent(zone, page))
+ if (zone != page_zone(page))
return 1;
return 0;
@@ -704,75 +625,6 @@ void destroy_large_folio(struct folio *folio)
compound_page_dtors[dtor](&folio->page);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-unsigned int _debug_guardpage_minorder;
-
-bool _debug_pagealloc_enabled_early __read_mostly
- = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
-EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
-DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
-EXPORT_SYMBOL(_debug_pagealloc_enabled);
-
-DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
-
-static int __init early_debug_pagealloc(char *buf)
-{
- return kstrtobool(buf, &_debug_pagealloc_enabled_early);
-}
-early_param("debug_pagealloc", early_debug_pagealloc);
-
-static int __init debug_guardpage_minorder_setup(char *buf)
-{
- unsigned long res;
-
- if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
- return 0;
- }
- _debug_guardpage_minorder = res;
- pr_info("Setting debug_guardpage_minorder to %lu\n", res);
- return 0;
-}
-early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return false;
-
- if (order >= debug_guardpage_minorder())
- return false;
-
- __SetPageGuard(page);
- INIT_LIST_HEAD(&page->buddy_list);
- set_page_private(page, order);
- /* Guard pages are not available for any usage */
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
-
- return true;
-}
-
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return;
-
- __ClearPageGuard(page);
-
- set_page_private(page, 0);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, (1 << order), migratetype);
-}
-#else
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
-#endif
-
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
@@ -1131,6 +983,11 @@ static inline bool free_page_is_bad(struct page *page)
return true;
}
+static inline bool is_check_pages_enabled(void)
+{
+ return static_branch_unlikely(&check_pages_enabled);
+}
+
static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
struct folio *folio = (struct folio *)head_page;
@@ -1142,7 +999,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
- if (!static_branch_unlikely(&check_pages_enabled)) {
+ if (!is_check_pages_enabled()) {
ret = 0;
goto out;
}
@@ -1521,7 +1378,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
/* end_pfn is one past the range we are checking */
end_pfn--;
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ if (!pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
@@ -1540,33 +1397,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
return start_page;
}
-void set_zone_contiguous(struct zone *zone)
-{
- unsigned long block_start_pfn = zone->zone_start_pfn;
- unsigned long block_end_pfn;
-
- block_end_pfn = pageblock_end_pfn(block_start_pfn);
- for (; block_start_pfn < zone_end_pfn(zone);
- block_start_pfn = block_end_pfn,
- block_end_pfn += pageblock_nr_pages) {
-
- block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
-
- if (!__pageblock_pfn_to_page(block_start_pfn,
- block_end_pfn, zone))
- return;
- cond_resched();
- }
-
- /* We confirm that there is no hole */
- zone->contiguous = true;
-}
-
-void clear_zone_contiguous(struct zone *zone)
-{
- zone->contiguous = false;
-}
-
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
@@ -2275,6 +2105,43 @@ do_steal:
}
+#ifdef CONFIG_CMA
+/*
+ * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via
+ * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok
+ * again without ALLOC_CMA to see if to use CMA first.
+ */
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+ unsigned long watermark;
+ bool cma_first = false;
+
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+ /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */
+ if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2);
+ } else {
+ /*
+ * watermark failed means UNMOVABLE & RECLAIMBLE is not enough
+ * now, we should use cma first to keep them stay around the
+ * corresponding watermark
+ */
+ cma_first = true;
+ }
+ return cma_first;
+}
+#else
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+ return false;
+}
+#endif
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
@@ -2288,12 +2155,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
if (IS_ENABLED(CONFIG_CMA)) {
/*
* Balance movable allocations between regular and CMA areas by
- * allocating from CMA when over half of the zone's free memory
- * is in the CMA area.
+ * allocating from CMA base on judging zone_watermark_ok again
+ * to see if the latest check got pass via the help of CMA
*/
if (alloc_flags & ALLOC_CMA &&
- zone_page_state(zone, NR_FREE_CMA_PAGES) >
- zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ use_cma_first(zone, order, alloc_flags)) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
return page;
@@ -2501,61 +2367,6 @@ void drain_all_pages(struct zone *zone)
__drain_all_pages(zone, false);
}
-#ifdef CONFIG_HIBERNATION
-
-/*
- * Touch the watchdog for every WD_PAGE_COUNT pages.
- */
-#define WD_PAGE_COUNT (128*1024)
-
-void mark_free_pages(struct zone *zone)
-{
- unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
- unsigned long flags;
- unsigned int order, t;
- struct page *page;
-
- if (zone_is_empty(zone))
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
-
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
-
- if (page_zone(page) != zone)
- continue;
-
- if (!swsusp_page_is_forbidden(page))
- swsusp_unset_page_free(page);
- }
-
- for_each_migratetype_order(order, t) {
- list_for_each_entry(page,
- &zone->free_area[order].free_list[t], buddy_list) {
- unsigned long i;
-
- pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++) {
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
- swsusp_set_page_free(pfn_to_page(pfn + i));
- }
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
-}
-#endif /* CONFIG_PM */
-
static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
unsigned int order)
{
@@ -3052,7 +2863,8 @@ struct page *rmqueue(struct zone *preferred_zone,
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
+ if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))
+ && (alloc_flags & ALLOC_KSWAPD)) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
@@ -3061,80 +2873,6 @@ out:
return page;
}
-#ifdef CONFIG_FAIL_PAGE_ALLOC
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_gfp_highmem;
- bool ignore_gfp_reclaim;
- u32 min_order;
-} fail_page_alloc = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_reclaim = true,
- .ignore_gfp_highmem = true,
- .min_order = 1,
-};
-
-static int __init setup_fail_page_alloc(char *str)
-{
- return setup_fault_attr(&fail_page_alloc.attr, str);
-}
-__setup("fail_page_alloc=", setup_fail_page_alloc);
-
-static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- int flags = 0;
-
- if (order < fail_page_alloc.min_order)
- return false;
- if (gfp_mask & __GFP_NOFAIL)
- return false;
- if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
- return false;
- if (fail_page_alloc.ignore_gfp_reclaim &&
- (gfp_mask & __GFP_DIRECT_RECLAIM))
- return false;
-
- /* See comment in __should_failslab() */
- if (gfp_mask & __GFP_NOWARN)
- flags |= FAULT_NOWARN;
-
- return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_page_alloc_debugfs(void)
-{
- umode_t mode = S_IFREG | 0600;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
- &fail_page_alloc.attr);
-
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim);
- debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem);
- debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
-
- return 0;
-}
-
-late_initcall(fail_page_alloc_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else /* CONFIG_FAIL_PAGE_ALLOC */
-
-static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- return false;
-}
-
-#endif /* CONFIG_FAIL_PAGE_ALLOC */
-
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
@@ -5137,383 +4875,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-static inline void show_node(struct zone *zone)
-{
- if (IS_ENABLED(CONFIG_NUMA))
- printk("Node %d ", zone_to_nid(zone));
-}
-
-long si_mem_available(void)
-{
- long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
- unsigned long pages[NR_LRU_LISTS];
- unsigned long reclaimable;
- struct zone *zone;
- int lru;
-
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
-
- for_each_zone(zone)
- wmark_low += low_wmark_pages(zone);
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping or OOM.
- */
- available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping or thrashing. Assume at least half of the page
- * cache, or the low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab and other kernel memory consists of
- * items that are in use, and cannot be freed. Cap this estimate at the
- * low watermark.
- */
- reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
- available += reclaimable - min(reclaimable / 2, wmark_low);
-
- if (available < 0)
- available = 0;
- return available;
-}
-EXPORT_SYMBOL_GPL(si_mem_available);
-
-void si_meminfo(struct sysinfo *val)
-{
- val->totalram = totalram_pages();
- val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_zone_page_state(NR_FREE_PAGES);
- val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages();
- val->freehigh = nr_free_highpages();
- val->mem_unit = PAGE_SIZE;
-}
-
-EXPORT_SYMBOL(si_meminfo);
-
-#ifdef CONFIG_NUMA
-void si_meminfo_node(struct sysinfo *val, int nid)
-{
- int zone_type; /* needs to be signed */
- unsigned long managed_pages = 0;
- unsigned long managed_highpages = 0;
- unsigned long free_highpages = 0;
- pg_data_t *pgdat = NODE_DATA(nid);
-
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
- struct zone *zone = &pgdat->node_zones[zone_type];
-
- if (is_highmem(zone)) {
- managed_highpages += zone_managed_pages(zone);
- free_highpages += zone_page_state(zone, NR_FREE_PAGES);
- }
- }
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
- val->mem_unit = PAGE_SIZE;
-}
-#endif
-
-/*
- * Determine whether the node should be displayed or not, depending on whether
- * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
- */
-static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
-{
- if (!(flags & SHOW_MEM_FILTER_NODES))
- return false;
-
- /*
- * no node mask - aka implicit memory numa policy. Do not bother with
- * the synchronization - read_mems_allowed_begin - because we do not
- * have to be precise here.
- */
- if (!nodemask)
- nodemask = &cpuset_current_mems_allowed;
-
- return !node_isset(nid, *nodemask);
-}
-
-static void show_migration_types(unsigned char type)
-{
- static const char types[MIGRATE_TYPES] = {
- [MIGRATE_UNMOVABLE] = 'U',
- [MIGRATE_MOVABLE] = 'M',
- [MIGRATE_RECLAIMABLE] = 'E',
- [MIGRATE_HIGHATOMIC] = 'H',
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = 'C',
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = 'I',
-#endif
- };
- char tmp[MIGRATE_TYPES + 1];
- char *p = tmp;
- int i;
-
- for (i = 0; i < MIGRATE_TYPES; i++) {
- if (type & (1 << i))
- *p++ = types[i];
- }
-
- *p = '\0';
- printk(KERN_CONT "(%s) ", tmp);
-}
-
-static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
-{
- int zone_idx;
- for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
- if (zone_managed_pages(pgdat->node_zones + zone_idx))
- return true;
- return false;
-}
-
-/*
- * Show free area list (used inside shift_scroll-lock stuff)
- * We also calculate the percentage fragmentation. We do this by counting the
- * memory on each free list with the exception of the first item on the list.
- *
- * Bits in @filter:
- * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
- * cpuset.
- */
-void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
-{
- unsigned long free_pcp = 0;
- int cpu, nid;
- struct zone *zone;
- pg_data_t *pgdat;
-
- for_each_populated_zone(zone) {
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
- }
-
- printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
- " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu\n"
- " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu\n"
- " sec_pagetables:%lu bounce:%lu\n"
- " kernel_misc_reclaimable:%lu\n"
- " free:%lu free_pcp:%lu free_cma:%lu\n",
- global_node_page_state(NR_ACTIVE_ANON),
- global_node_page_state(NR_INACTIVE_ANON),
- global_node_page_state(NR_ISOLATED_ANON),
- global_node_page_state(NR_ACTIVE_FILE),
- global_node_page_state(NR_INACTIVE_FILE),
- global_node_page_state(NR_ISOLATED_FILE),
- global_node_page_state(NR_UNEVICTABLE),
- global_node_page_state(NR_FILE_DIRTY),
- global_node_page_state(NR_WRITEBACK),
- global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
- global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
- global_node_page_state(NR_FILE_MAPPED),
- global_node_page_state(NR_SHMEM),
- global_node_page_state(NR_PAGETABLE),
- global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
- global_zone_page_state(NR_FREE_PAGES),
- free_pcp,
- global_zone_page_state(NR_FREE_CMA_PAGES));
-
- for_each_online_pgdat(pgdat) {
- if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
- continue;
- if (!node_has_managed_zones(pgdat, max_zone_idx))
- continue;
-
- printk("Node %d"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
- " mapped:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
-#endif
- " writeback_tmp:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
- " pagetables:%lukB"
- " sec_pagetables:%lukB"
- " all_unreclaimable? %s"
- "\n",
- pgdat->node_id,
- K(node_page_state(pgdat, NR_ACTIVE_ANON)),
- K(node_page_state(pgdat, NR_INACTIVE_ANON)),
- K(node_page_state(pgdat, NR_ACTIVE_FILE)),
- K(node_page_state(pgdat, NR_INACTIVE_FILE)),
- K(node_page_state(pgdat, NR_UNEVICTABLE)),
- K(node_page_state(pgdat, NR_ISOLATED_ANON)),
- K(node_page_state(pgdat, NR_ISOLATED_FILE)),
- K(node_page_state(pgdat, NR_FILE_MAPPED)),
- K(node_page_state(pgdat, NR_FILE_DIRTY)),
- K(node_page_state(pgdat, NR_WRITEBACK)),
- K(node_page_state(pgdat, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(node_page_state(pgdat, NR_SHMEM_THPS)),
- K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
- K(node_page_state(pgdat, NR_ANON_THPS)),
-#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- node_page_state(pgdat, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- node_page_state(pgdat, NR_KERNEL_SCS_KB),
-#endif
- K(node_page_state(pgdat, NR_PAGETABLE)),
- K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
- "yes" : "no");
- }
-
- for_each_populated_zone(zone) {
- int i;
-
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- free_pcp = 0;
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
-
- show_node(zone);
- printk(KERN_CONT
- "%s"
- " free:%lukB"
- " boost:%lukB"
- " min:%lukB"
- " low:%lukB"
- " high:%lukB"
- " reserved_highatomic:%luKB"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " writepending:%lukB"
- " present:%lukB"
- " managed:%lukB"
- " mlocked:%lukB"
- " bounce:%lukB"
- " free_pcp:%lukB"
- " local_pcp:%ukB"
- " free_cma:%lukB"
- "\n",
- zone->name,
- K(zone_page_state(zone, NR_FREE_PAGES)),
- K(zone->watermark_boost),
- K(min_wmark_pages(zone)),
- K(low_wmark_pages(zone)),
- K(high_wmark_pages(zone)),
- K(zone->nr_reserved_highatomic),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
- K(zone->present_pages),
- K(zone_managed_pages(zone)),
- K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
- K(free_pcp),
- K(this_cpu_read(zone->per_cpu_pageset->count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
- printk(KERN_CONT "\n");
- }
-
- for_each_populated_zone(zone) {
- unsigned int order;
- unsigned long nr[MAX_ORDER + 1], flags, total = 0;
- unsigned char types[MAX_ORDER + 1];
-
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
- show_node(zone);
- printk(KERN_CONT "%s: ", zone->name);
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
- int type;
-
- nr[order] = area->nr_free;
- total += nr[order] << order;
-
- types[order] = 0;
- for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!free_area_empty(area, type))
- types[order] |= 1 << type;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
- printk(KERN_CONT "%lu*%lukB ",
- nr[order], K(1UL) << order);
- if (nr[order])
- show_migration_types(types[order]);
- }
- printk(KERN_CONT "= %lukB\n", K(total));
- }
-
- for_each_online_node(nid) {
- if (show_mem_node_skip(filter, nid, nodemask))
- continue;
- hugetlb_show_meminfo_node(nid);
- }
-
- printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
-
- show_swap_cache_info();
-}
-
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
@@ -5560,12 +4921,12 @@ static int __parse_numa_zonelist_order(char *s)
return 0;
}
-char numa_zonelist_order[] = "Node";
-
+static char numa_zonelist_order[] = "Node";
+#define NUMA_ZONELIST_ORDER_LEN 16
/*
* sysctl handler for numa_zonelist_order
*/
-int numa_zonelist_order_handler(struct ctl_table *table, int write,
+static int numa_zonelist_order_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
if (write)
@@ -5573,7 +4934,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
return proc_dostring(table, write, buffer, length, ppos);
}
-
static int node_load[MAX_NUMNODES];
/**
@@ -5976,6 +5336,7 @@ static int zone_batchsize(struct zone *zone)
#endif
}
+static int percpu_pagelist_high_fraction;
static int zone_highsize(struct zone *zone, int batch, int cpu_online)
{
#ifdef CONFIG_MMU
@@ -6505,7 +5866,7 @@ postcore_initcall(init_per_zone_wmark_min)
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6521,7 +5882,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6551,7 +5912,7 @@ static void setup_min_unmapped_ratio(void)
}
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6578,7 +5939,7 @@ static void setup_min_slab_ratio(void)
sysctl_min_slab_ratio) / 100;
}
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6602,8 +5963,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
+static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
{
int i;
@@ -6623,7 +5984,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -6656,9 +6017,83 @@ out:
return ret;
}
+static struct ctl_table page_alloc_sysctl_table[] = {
+ {
+ .procname = "min_free_kbytes",
+ .data = &min_free_kbytes,
+ .maxlen = sizeof(min_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_THREE_THOUSAND,
+ },
+ {
+ .procname = "percpu_pagelist_high_fraction",
+ .data = &percpu_pagelist_high_fraction,
+ .maxlen = sizeof(percpu_pagelist_high_fraction),
+ .mode = 0644,
+ .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
+ .mode = 0644,
+ .proc_handler = lowmem_reserve_ratio_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_zonelist_order",
+ .data = &numa_zonelist_order,
+ .maxlen = NUMA_ZONELIST_ORDER_LEN,
+ .mode = 0644,
+ .proc_handler = numa_zonelist_order_handler,
+ },
+ {
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+#endif
+ {}
+};
+
+void __init page_alloc_sysctl_init(void)
+{
+ register_sysctl_init("vm", page_alloc_sysctl_table);
+}
+
#ifdef CONFIG_CONTIG_ALLOC
-#if defined(CONFIG_DYNAMIC_DEBUG) || \
- (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
{
@@ -6672,11 +6107,6 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
dump_page(page, "migration failure");
}
}
-#else
-static inline void alloc_contig_dump_pages(struct list_head *page_list)
-{
-}
-#endif
/* [start, end) must belong to a single zone. */
int __alloc_contig_migrate_range(struct compact_control *cc,
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 31169b3e7f06..28c519fc9372 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -139,6 +139,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
int i;
struct page_ext *page_ext;
depot_stack_handle_t handle;
+ depot_stack_handle_t alloc_handle;
struct page_owner *page_owner;
u64 free_ts_nsec = local_clock();
@@ -146,6 +147,9 @@ void __reset_page_owner(struct page *page, unsigned short order)
if (unlikely(!page_ext))
return;
+ page_owner = get_page_owner(page_ext);
+ alloc_handle = page_owner->handle;
+
handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
for (i = 0; i < (1 << order); i++) {
__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -155,6 +159,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_ext = page_ext_next(page_ext);
}
page_ext_put(page_ext);
+ stack_depot_dec_count(alloc_handle);
}
static inline void __set_page_owner_handle(struct page_ext *page_ext,
@@ -196,6 +201,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
return;
__set_page_owner_handle(page_ext, handle, order, gfp_mask);
page_ext_put(page_ext);
+ stack_depot_inc_count(handle);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -713,6 +719,47 @@ static const struct file_operations proc_page_owner_operations = {
.llseek = lseek_page_owner,
};
+static void stack_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations page_owner_stack_op = {
+ .start = stack_start,
+ .next = stack_next,
+ .stop = stack_stop,
+ .show = stack_print
+};
+
+static int page_owner_stack_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &page_owner_stack_op,
+ sizeof(unsigned long));
+}
+
+const struct file_operations page_owner_stack_operations = {
+ .open = page_owner_stack_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+unsigned long page_owner_stack_threshold;
+
+int page_owner_threshold_get(void *data, u64 *val)
+{
+ *val = page_owner_stack_threshold;
+ return 0;
+}
+
+int page_owner_threshold_set(void *data, u64 val)
+{
+ page_owner_stack_threshold = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
+ &page_owner_threshold_set, "%llu");
+
static int __init pageowner_init(void)
{
if (!static_branch_unlikely(&page_owner_inited)) {
@@ -723,6 +770,13 @@ static int __init pageowner_init(void)
debugfs_create_file("page_owner", 0400, NULL, NULL,
&proc_page_owner_operations);
+ debugfs_create_file("page_owner_stacks", 0400, NULL, NULL,
+ &page_owner_stack_operations);
+ debugfs_create_file("page_owner_threshold", 0600, NULL, NULL,
+ &proc_page_owner_threshold);
+
+ page_owner_stack_threshold = 0;
+
return 0;
}
late_initcall(pageowner_init)
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 78dfaf9e8990..0523edab03a6 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
mmap_read_lock(mm);
pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
- NULL, &locked);
+ &locked);
if (locked)
mmap_read_unlock(mm);
if (pinned_pages <= 0)
diff --git a/mm/rmap.c b/mm/rmap.c
index b42fc0389c24..ae127f60a4fb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2328,7 +2328,7 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
npages = get_user_pages_remote(mm, start, npages,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL, NULL);
+ pages, NULL);
if (npages < 0)
return npages;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 0b502625cd30..974b32ba8b9d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -35,7 +35,7 @@
#define SECRETMEM_MODE_MASK (0x0)
#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
-static bool secretmem_enable __ro_after_init;
+static bool secretmem_enable __ro_after_init = 1;
module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(secretmem_enable,
"Enable secretmem and memfd_secret(2) system call");
diff --git a/mm/show_mem.c b/mm/show_mem.c
new file mode 100644
index 000000000000..01f8e9905817
--- /dev/null
+++ b/mm/show_mem.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic show_mem() implementation
+ *
+ * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/cma.h>
+#include <linux/cpuset.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+
+#include "internal.h"
+#include "swap.h"
+
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
+unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
+
+static inline void show_node(struct zone *zone)
+{
+ if (IS_ENABLED(CONFIG_NUMA))
+ printk("Node %d ", zone_to_nid(zone));
+}
+
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += low_wmark_pages(zone);
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping or OOM.
+ */
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
+ */
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
+void si_meminfo(struct sysinfo *val)
+{
+ val->totalram = totalram_pages();
+ val->sharedram = global_node_page_state(NR_SHMEM);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
+ val->bufferram = nr_blockdev_pages();
+ val->totalhigh = totalhigh_pages();
+ val->freehigh = nr_free_highpages();
+ val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+ int zone_type; /* needs to be signed */
+ unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone_managed_pages(zone);
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#else
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#endif
+ val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
+{
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ return false;
+
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
+}
+
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = 'I',
+#endif
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk(KERN_CONT "(%s) ", tmp);
+}
+
+static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
+{
+ int zone_idx;
+ for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
+ if (zone_managed_pages(pgdat->node_zones + zone_idx))
+ return true;
+ return false;
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
+ */
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long free_pcp = 0;
+ int cpu, nid;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ for_each_populated_zone(zone) {
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+ }
+
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
+ " kernel_misc_reclaimable:%lu\n"
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
+ global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
+ global_zone_page_state(NR_FREE_PAGES),
+ free_pcp,
+ global_zone_page_state(NR_FREE_CMA_PAGES));
+
+ for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+ if (!node_has_managed_zones(pgdat, max_zone_idx))
+ continue;
+
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
+ " pagetables:%lukB"
+ " sec_pagetables:%lukB"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
+#endif
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
+ K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
+ }
+
+ for_each_populated_zone(zone) {
+ int i;
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+
+ show_node(zone);
+ printk(KERN_CONT
+ "%s"
+ " free:%lukB"
+ " boost:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " reserved_highatomic:%luKB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " writepending:%lukB"
+ " present:%lukB"
+ " managed:%lukB"
+ " mlocked:%lukB"
+ " bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
+ " free_cma:%lukB"
+ "\n",
+ zone->name,
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->watermark_boost),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+ K(zone->nr_reserved_highatomic),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+ K(zone->present_pages),
+ K(zone_managed_pages(zone)),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->per_cpu_pageset->count)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
+ printk(KERN_CONT "\n");
+ }
+
+ for_each_populated_zone(zone) {
+ unsigned int order;
+ unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+ unsigned char types[MAX_ORDER + 1];
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+ show_node(zone);
+ printk(KERN_CONT "%s: ", zone->name);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
+ total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!free_area_empty(area, type))
+ types[order] |= 1 << type;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ printk(KERN_CONT "%lu*%lukB ",
+ nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
+ printk(KERN_CONT "= %lukB\n", K(total));
+ }
+
+ for_each_online_node(nid) {
+ if (show_mem_node_skip(filter, nid, nodemask))
+ continue;
+ hugetlb_show_meminfo_node(nid);
+ }
+
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
+
+ show_swap_cache_info();
+}
+
+void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long total = 0, reserved = 0, highmem = 0;
+ struct zone *zone;
+
+ printk("Mem-Info:\n");
+ __show_free_areas(filter, nodemask, max_zone_idx);
+
+ for_each_populated_zone(zone) {
+
+ total += zone->present_pages;
+ reserved += zone->present_pages - zone_managed_pages(zone);
+
+ if (is_highmem(zone))
+ highmem += zone->present_pages;
+ }
+
+ printk("%lu pages RAM\n", total);
+ printk("%lu pages HighMem/MovableOnly\n", highmem);
+ printk("%lu pages reserved\n", reserved);
+#ifdef CONFIG_CMA
+ printk("%lu pages cma reserved\n", totalcma_pages);
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
+#endif
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 274bbf797480..c74259001d5e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -41,6 +41,7 @@
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
+#include <linux/suspend.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6d0cd2840cf0..15efbfbb1963 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2458,7 +2458,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ if (gfp_has_io_fs(sc->gfp_mask))
inactive >>= 3;
too_many = isolated > inactive;
diff --git a/mm/workingset.c b/mm/workingset.c
index 817758951886..90ae785d4c9c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -255,6 +255,29 @@ static void *lru_gen_eviction(struct folio *folio)
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @memcgid, @pglist_data, @token, @workingset with the values
+ * unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, bool file, int *memcgid,
+ struct pglist_data **pgdat, unsigned long *token, bool *workingset)
+{
+ struct mem_cgroup *eviction_memcg;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ unsigned long min_seq;
+
+ unpack_shadow(shadow, memcgid, pgdat, token, workingset);
+ eviction_memcg = mem_cgroup_from_id(*memcgid);
+
+ lruvec = mem_cgroup_lruvec(eviction_memcg, *pgdat);
+ lrugen = &lruvec->lrugen;
+
+ min_seq = READ_ONCE(lrugen->min_seq[file]);
+ return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
int hist, tier, refs;
@@ -269,23 +292,22 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
- unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
-
- if (pgdat != folio_pgdat(folio))
- return;
-
rcu_read_lock();
+ if (!lru_gen_test_recent(shadow, type, &memcg_id, &pgdat, &token,
+ &workingset))
+ goto unlock;
+
memcg = folio_memcg_rcu(folio);
if (memcg_id != mem_cgroup_id(memcg))
goto unlock;
+ if (pgdat != folio_pgdat(folio))
+ goto unlock;
+
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
-
min_seq = READ_ONCE(lrugen->min_seq[type]);
- if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
- goto unlock;
hist = lru_hist_from_seq(min_seq);
/* see the comment in folio_lru_refs() */
@@ -317,6 +339,12 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
+static bool lru_gen_test_recent(void *shadow, bool file, int *memcgid,
+ struct pglist_data **pgdat, unsigned long *token, bool *workingset)
+{
+ return false;
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
@@ -385,42 +413,34 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
}
/**
- * workingset_refault - Evaluate the refault of a previously evicted folio.
- * @folio: The freshly allocated replacement folio.
- * @shadow: Shadow entry of the evicted folio.
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted folio in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-void workingset_refault(struct folio *folio, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset)
{
- bool file = folio_is_file_lru(folio);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
- struct pglist_data *pgdat;
- struct mem_cgroup *memcg;
- unsigned long eviction;
- struct lruvec *lruvec;
unsigned long refault;
- bool workingset;
int memcgid;
- long nr;
+ struct pglist_data *pgdat;
+ unsigned long eviction;
- if (lru_gen_enabled()) {
- lru_gen_refault(folio, shadow);
- return;
- }
+ if (lru_gen_enabled())
+ return lru_gen_test_recent(shadow, file, &memcgid, &pgdat, &eviction,
+ workingset);
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
- /* Flush stats (and potentially sleep) before holding RCU read lock */
- mem_cgroup_flush_stats_ratelimited();
-
- rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
* have been deleted since the folio's eviction.
@@ -439,7 +459,8 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
- goto out;
+ return false;
+
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -462,20 +483,6 @@ void workingset_refault(struct folio *folio, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
- * The activation decision for this folio is made at the level
- * where the eviction occurred, as that is where the LRU order
- * during folio reclaim is being determined.
- *
- * However, the cgroup that will own the folio is the one that
- * is actually experiencing the refault event.
- */
- nr = folio_nr_pages(folio);
- memcg = folio_memcg(folio);
- pgdat = folio_pgdat(folio);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
- /*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
* all the memory was available to the workingset. Whether
@@ -495,7 +502,54 @@ void workingset_refault(struct folio *folio, void *shadow)
NR_INACTIVE_ANON);
}
}
- if (refault_distance > workingset_size)
+
+ return refault_distance <= workingset_size;
+}
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+ bool file = folio_is_file_lru(folio);
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ bool workingset;
+ long nr;
+
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
+ /* Flush stats (and potentially sleep) before holding RCU read lock */
+ mem_cgroup_flush_stats_ratelimited();
+
+ rcu_read_lock();
+
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+ * is actually experiencing the refault event.
+ */
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset))
goto out;
folio_set_active(folio);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 02f7f414aade..c0d433541636 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1341,7 +1341,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
spin_unlock(&pool->lock);
class = zspage_class(pool, zspage);
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ off = offset_in_page(class->size * obj_idx);
local_lock(&zs_map_area.lock);
area = this_cpu_ptr(&zs_map_area);
@@ -1381,7 +1381,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
class = zspage_class(pool, zspage);
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ off = offset_in_page(class->size * obj_idx);
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1438,7 +1438,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
offset = obj * class->size;
nr_page = offset >> PAGE_SHIFT;
- m_offset = offset & ~PAGE_MASK;
+ m_offset = offset_in_page(offset);
m_page = get_first_page(zspage);
for (i = 0; i < nr_page; i++)
@@ -1548,7 +1548,7 @@ static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
void *vaddr;
obj_to_location(obj, &f_page, &f_objidx);
- f_offset = (class_size * f_objidx) & ~PAGE_MASK;
+ f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
@@ -1640,8 +1640,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
obj_to_location(src, &s_page, &s_objidx);
obj_to_location(dst, &d_page, &d_objidx);
- s_off = (class->size * s_objidx) & ~PAGE_MASK;
- d_off = (class->size * d_objidx) & ~PAGE_MASK;
+ s_off = offset_in_page(class->size * s_objidx);
+ d_off = offset_in_page(class->size * d_objidx);
if (s_off + class->size > PAGE_SIZE)
s_size = PAGE_SIZE - s_off;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 02207e852d79..06cead2b8e34 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -103,7 +103,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
mmap_read_lock(current->mm);
npgs = pin_user_pages(address, umem->npgs,
- gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
+ gup_flags | FOLL_LONGTERM, &umem->pgs[0]);
mmap_read_unlock(current->mm);
if (npgs != umem->npgs) {
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index f8bd6178d17b..fc7ba95e86a0 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -155,6 +155,7 @@ aquired||acquired
aquisition||acquisition
arbitary||arbitrary
architechture||architecture
+archtecture||architecture
arguement||argument
arguements||arguments
arithmatic||arithmetic
@@ -279,6 +280,7 @@ cant'||can't
canot||cannot
cann't||can't
cannnot||cannot
+capabiity||capability
capabilites||capabilities
capabilties||capabilities
capabilty||capability
@@ -426,6 +428,7 @@ cotrol||control
cound||could
couter||counter
coutner||counter
+creationg||creating
cryptocraphic||cryptographic
cummulative||cumulative
cunter||counter
@@ -492,6 +495,7 @@ destorys||destroys
destroied||destroyed
detabase||database
deteced||detected
+detecion||detection
detectt||detect
detroyed||destroyed
develope||develop
@@ -513,6 +517,7 @@ diferent||different
differrence||difference
diffrent||different
differenciate||differentiate
+diffreential||differential
diffrentiate||differentiate
difinition||definition
digial||digital
@@ -617,6 +622,7 @@ evalute||evaluate
evalutes||evaluates
evalution||evaluation
excecutable||executable
+excceed||exceed
exceded||exceeded
exceds||exceeds
exceeed||exceed
@@ -632,6 +638,7 @@ existant||existent
exixt||exist
exsits||exists
exlcude||exclude
+exlcuding||excluding
exlcusive||exclusive
exlusive||exclusive
exmaple||example
@@ -726,6 +733,8 @@ generiously||generously
genereate||generate
genereted||generated
genric||generic
+gerenal||general
+geting||getting
globel||global
grabing||grabbing
grahical||graphical
@@ -899,6 +908,7 @@ iteraions||iterations
iternations||iterations
itertation||iteration
itslef||itself
+ivalid||invalid
jave||java
jeffies||jiffies
jumpimng||jumping
@@ -977,6 +987,7 @@ microprocesspr||microprocessor
migrateable||migratable
millenium||millennium
milliseonds||milliseconds
+minimim||minimum
minium||minimum
minimam||minimum
minimun||minimum
@@ -1042,6 +1053,7 @@ notifed||notified
notity||notify
nubmer||number
numebr||number
+numer||number
numner||number
nunber||number
obtaion||obtain
@@ -1061,6 +1073,7 @@ offet||offset
offlaod||offload
offloded||offloaded
offseting||offsetting
+oflload||offload
omited||omitted
omiting||omitting
omitt||omit
@@ -1105,6 +1118,7 @@ pakage||package
paket||packet
pallette||palette
paln||plan
+palne||plane
paramameters||parameters
paramaters||parameters
paramater||parameter
@@ -1181,12 +1195,14 @@ previsously||previously
primative||primitive
princliple||principle
priorty||priority
+priting||printing
privilaged||privileged
privilage||privilege
priviledge||privilege
priviledges||privileges
privleges||privileges
probaly||probably
+probabalistic||probabilistic
procceed||proceed
proccesors||processors
procesed||processed
@@ -1460,6 +1476,7 @@ submited||submitted
submition||submission
succeded||succeeded
suceed||succeed
+succesfuly||successfully
succesfully||successfully
succesful||successful
successed||succeeded
@@ -1503,6 +1520,7 @@ symetric||symmetric
synax||syntax
synchonized||synchronized
sychronization||synchronization
+sychronously||synchronously
synchronuously||synchronously
syncronize||synchronize
syncronized||synchronized
@@ -1532,6 +1550,7 @@ threee||three
threshhold||threshold
thresold||threshold
throught||through
+tansition||transition
trackling||tracking
troughput||throughput
trys||tries
@@ -1611,6 +1630,7 @@ unneccessary||unnecessary
unnecesary||unnecessary
unneedingly||unnecessarily
unnsupported||unsupported
+unuspported||unsupported
unmached||unmatched
unprecise||imprecise
unpriviledged||unprivileged
@@ -1657,6 +1677,7 @@ verfication||verification
veriosn||version
verisons||versions
verison||version
+veritical||vertical
verson||version
vicefersa||vice-versa
virtal||virtual
@@ -1677,6 +1698,7 @@ whenver||whenever
wheter||whether
whe||when
wierd||weird
+wihout||without
wiil||will
wirte||write
withing||within
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 31af29f669d2..ac20c0bdff9d 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -916,7 +916,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
*/
mmap_read_lock(bprm->mm);
ret = get_user_pages_remote(bprm->mm, pos, 1,
- FOLL_FORCE, &page, NULL, NULL);
+ FOLL_FORCE, &page, NULL);
mmap_read_unlock(bprm->mm);
if (ret <= 0)
return false;
diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h
index 1bb0afc21309..81563c3dfce7 100644
--- a/tools/testing/radix-tree/linux/init.h
+++ b/tools/testing/radix-tree/linux/init.h
@@ -1 +1,2 @@
#define __init
+#define __exit
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 9286d3baa12d..03539d86cdf0 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -14,6 +14,7 @@
#include "test.h"
#include <stdlib.h>
#include <time.h>
+#include "linux/init.h"
#define module_init(x)
#define module_exit(x)
@@ -22,7 +23,6 @@
#define dump_stack() assert(0)
#include "../../../lib/maple_tree.c"
-#undef CONFIG_DEBUG_MAPLE_TREE
#include "../../../lib/test_maple_tree.c"
#define RCU_RANGE_COUNT 1000
@@ -81,7 +81,7 @@ static void check_mas_alloc_node_count(struct ma_state *mas)
* check_new_node() - Check the creation of new nodes and error path
* verification.
*/
-static noinline void check_new_node(struct maple_tree *mt)
+static noinline void __init check_new_node(struct maple_tree *mt)
{
struct maple_node *mn, *mn2, *mn3;
@@ -455,7 +455,7 @@ static noinline void check_new_node(struct maple_tree *mt)
/*
* Check erasing including RCU.
*/
-static noinline void check_erase(struct maple_tree *mt, unsigned long index,
+static noinline void __init check_erase(struct maple_tree *mt, unsigned long index,
void *ptr)
{
MT_BUG_ON(mt, mtree_test_erase(mt, index) != ptr);
@@ -465,24 +465,24 @@ static noinline void check_erase(struct maple_tree *mt, unsigned long index,
#define erase_check_insert(mt, i) check_insert(mt, set[i], entry[i%2])
#define erase_check_erase(mt, i) check_erase(mt, set[i], entry[i%2])
-static noinline void check_erase_testset(struct maple_tree *mt)
+static noinline void __init check_erase_testset(struct maple_tree *mt)
{
- unsigned long set[] = { 5015, 5014, 5017, 25, 1000,
- 1001, 1002, 1003, 1005, 0,
- 6003, 6002, 6008, 6012, 6015,
- 7003, 7002, 7008, 7012, 7015,
- 8003, 8002, 8008, 8012, 8015,
- 9003, 9002, 9008, 9012, 9015,
- 10003, 10002, 10008, 10012, 10015,
- 11003, 11002, 11008, 11012, 11015,
- 12003, 12002, 12008, 12012, 12015,
- 13003, 13002, 13008, 13012, 13015,
- 14003, 14002, 14008, 14012, 14015,
- 15003, 15002, 15008, 15012, 15015,
- };
-
-
- void *ptr = &set;
+ static const unsigned long set[] = { 5015, 5014, 5017, 25, 1000,
+ 1001, 1002, 1003, 1005, 0,
+ 6003, 6002, 6008, 6012, 6015,
+ 7003, 7002, 7008, 7012, 7015,
+ 8003, 8002, 8008, 8012, 8015,
+ 9003, 9002, 9008, 9012, 9015,
+ 10003, 10002, 10008, 10012, 10015,
+ 11003, 11002, 11008, 11012, 11015,
+ 12003, 12002, 12008, 12012, 12015,
+ 13003, 13002, 13008, 13012, 13015,
+ 14003, 14002, 14008, 14012, 14015,
+ 15003, 15002, 15008, 15012, 15015,
+ };
+
+
+ void *ptr = &check_erase_testset;
void *entry[2] = { ptr, mt };
void *root_node;
@@ -739,7 +739,7 @@ static noinline void check_erase_testset(struct maple_tree *mt)
int mas_ce2_over_count(struct ma_state *mas_start, struct ma_state *mas_end,
void *s_entry, unsigned long s_min,
void *e_entry, unsigned long e_max,
- unsigned long *set, int i, bool null_entry)
+ const unsigned long *set, int i, bool null_entry)
{
int count = 0, span = 0;
unsigned long retry = 0;
@@ -969,8 +969,8 @@ retry:
}
#if defined(CONFIG_64BIT)
-static noinline void check_erase2_testset(struct maple_tree *mt,
- unsigned long *set, unsigned long size)
+static noinline void __init check_erase2_testset(struct maple_tree *mt,
+ const unsigned long *set, unsigned long size)
{
int entry_count = 0;
int check = 0;
@@ -1054,7 +1054,7 @@ static noinline void check_erase2_testset(struct maple_tree *mt,
if (entry_count)
MT_BUG_ON(mt, !mt_height(mt));
#if check_erase2_debug > 1
- mt_dump(mt);
+ mt_dump(mt, mt_dump_hex);
#endif
#if check_erase2_debug
pr_err("Done\n");
@@ -1085,7 +1085,7 @@ static noinline void check_erase2_testset(struct maple_tree *mt,
mas_for_each(&mas, foo, ULONG_MAX) {
if (xa_is_zero(foo)) {
if (addr == mas.index) {
- mt_dump(mas.tree);
+ mt_dump(mas.tree, mt_dump_hex);
pr_err("retry failed %lu - %lu\n",
mas.index, mas.last);
MT_BUG_ON(mt, 1);
@@ -1114,11 +1114,11 @@ static noinline void check_erase2_testset(struct maple_tree *mt,
/* These tests were pulled from KVM tree modifications which failed. */
-static noinline void check_erase2_sets(struct maple_tree *mt)
+static noinline void __init check_erase2_sets(struct maple_tree *mt)
{
void *entry;
unsigned long start = 0;
- unsigned long set[] = {
+ static const unsigned long set[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140721266458624, 140737488351231,
ERASE, 140721266458624, 140737488351231,
@@ -1136,7 +1136,7 @@ ERASE, 140253902692352, 140253902864383,
STORE, 140253902692352, 140253902696447,
STORE, 140253902696448, 140253902864383,
};
- unsigned long set2[] = {
+ static const unsigned long set2[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140735933583360, 140737488351231,
ERASE, 140735933583360, 140737488351231,
@@ -1160,7 +1160,7 @@ STORE, 140277094813696, 140277094821887,
STORE, 140277094821888, 140277094825983,
STORE, 140735933906944, 140735933911039,
};
- unsigned long set3[] = {
+ static const unsigned long set3[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140735790264320, 140737488351231,
ERASE, 140735790264320, 140737488351231,
@@ -1203,7 +1203,7 @@ STORE, 47135835840512, 47135835885567,
STORE, 47135835885568, 47135835893759,
};
- unsigned long set4[] = {
+ static const unsigned long set4[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140728251703296, 140737488351231,
ERASE, 140728251703296, 140737488351231,
@@ -1224,7 +1224,7 @@ ERASE, 47646523277312, 47646523445247,
STORE, 47646523277312, 47646523400191,
};
- unsigned long set5[] = {
+ static const unsigned long set5[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140726874062848, 140737488351231,
ERASE, 140726874062848, 140737488351231,
@@ -1357,7 +1357,7 @@ STORE, 47884791619584, 47884791623679,
STORE, 47884791623680, 47884791627775,
};
- unsigned long set6[] = {
+ static const unsigned long set6[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140722999021568, 140737488351231,
ERASE, 140722999021568, 140737488351231,
@@ -1489,7 +1489,7 @@ ERASE, 47430432014336, 47430432022527,
STORE, 47430432014336, 47430432018431,
STORE, 47430432018432, 47430432022527,
};
- unsigned long set7[] = {
+ static const unsigned long set7[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140729808330752, 140737488351231,
ERASE, 140729808330752, 140737488351231,
@@ -1621,7 +1621,7 @@ ERASE, 47439987130368, 47439987138559,
STORE, 47439987130368, 47439987134463,
STORE, 47439987134464, 47439987138559,
};
- unsigned long set8[] = {
+ static const unsigned long set8[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140722482974720, 140737488351231,
ERASE, 140722482974720, 140737488351231,
@@ -1754,7 +1754,7 @@ STORE, 47708488638464, 47708488642559,
STORE, 47708488642560, 47708488646655,
};
- unsigned long set9[] = {
+ static const unsigned long set9[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140736427839488, 140737488351231,
ERASE, 140736427839488, 140736427839488,
@@ -5620,7 +5620,7 @@ ERASE, 47906195480576, 47906195480576,
STORE, 94641242615808, 94641242750975,
};
- unsigned long set10[] = {
+ static const unsigned long set10[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140736427839488, 140737488351231,
ERASE, 140736427839488, 140736427839488,
@@ -9484,7 +9484,7 @@ STORE, 139726599680000, 139726599684095,
ERASE, 47906195480576, 47906195480576,
STORE, 94641242615808, 94641242750975,
};
- unsigned long set11[] = {
+ static const unsigned long set11[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140732658499584, 140737488351231,
ERASE, 140732658499584, 140732658499584,
@@ -9510,7 +9510,7 @@ STORE, 140732658565120, 140732658569215,
STORE, 140732658552832, 140732658565119,
};
- unsigned long set12[] = { /* contains 12 values. */
+ static const unsigned long set12[] = { /* contains 12 values. */
STORE, 140737488347136, 140737488351231,
STORE, 140732658499584, 140737488351231,
ERASE, 140732658499584, 140732658499584,
@@ -9537,7 +9537,7 @@ STORE, 140732658552832, 140732658565119,
STORE, 140014592741375, 140014592741375, /* contrived */
STORE, 140014592733184, 140014592741376, /* creates first entry retry. */
};
- unsigned long set13[] = {
+ static const unsigned long set13[] = {
STORE, 140373516247040, 140373516251135,/*: ffffa2e7b0e10d80 */
STORE, 140373516251136, 140373516255231,/*: ffffa2e7b1195d80 */
STORE, 140373516255232, 140373516443647,/*: ffffa2e7b0e109c0 */
@@ -9550,7 +9550,7 @@ STORE, 140373518684160, 140373518688254,/*: ffffa2e7b05fec00 */
STORE, 140373518688256, 140373518692351,/*: ffffa2e7bfbdcd80 */
STORE, 140373518692352, 140373518696447,/*: ffffa2e7b0749e40 */
};
- unsigned long set14[] = {
+ static const unsigned long set14[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140731667996672, 140737488351231,
SNULL, 140731668000767, 140737488351231,
@@ -9834,7 +9834,7 @@ SNULL, 139826136543232, 139826136809471,
STORE, 139826136809472, 139826136842239,
STORE, 139826136543232, 139826136809471,
};
- unsigned long set15[] = {
+ static const unsigned long set15[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140722061451264, 140737488351231,
SNULL, 140722061455359, 140737488351231,
@@ -10119,7 +10119,7 @@ STORE, 139906808958976, 139906808991743,
STORE, 139906808692736, 139906808958975,
};
- unsigned long set16[] = {
+ static const unsigned long set16[] = {
STORE, 94174808662016, 94174809321471,
STORE, 94174811414528, 94174811426815,
STORE, 94174811426816, 94174811430911,
@@ -10330,7 +10330,7 @@ STORE, 139921865613312, 139921865617407,
STORE, 139921865547776, 139921865564159,
};
- unsigned long set17[] = {
+ static const unsigned long set17[] = {
STORE, 94397057224704, 94397057646591,
STORE, 94397057650688, 94397057691647,
STORE, 94397057691648, 94397057695743,
@@ -10392,7 +10392,7 @@ STORE, 140720477511680, 140720477646847,
STORE, 140720478302208, 140720478314495,
STORE, 140720478314496, 140720478318591,
};
- unsigned long set18[] = {
+ static const unsigned long set18[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140724953673728, 140737488351231,
SNULL, 140724953677823, 140737488351231,
@@ -10425,7 +10425,7 @@ STORE, 140222970597376, 140222970605567,
ERASE, 140222970597376, 140222970605567,
STORE, 140222970597376, 140222970605567,
};
- unsigned long set19[] = {
+ static const unsigned long set19[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140725182459904, 140737488351231,
SNULL, 140725182463999, 140737488351231,
@@ -10694,7 +10694,7 @@ STORE, 140656836775936, 140656836780031,
STORE, 140656787476480, 140656791920639,
ERASE, 140656774639616, 140656779083775,
};
- unsigned long set20[] = {
+ static const unsigned long set20[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140735952392192, 140737488351231,
SNULL, 140735952396287, 140737488351231,
@@ -10850,7 +10850,7 @@ STORE, 140590386819072, 140590386823167,
STORE, 140590386823168, 140590386827263,
SNULL, 140590376591359, 140590376595455,
};
- unsigned long set21[] = {
+ static const unsigned long set21[] = {
STORE, 93874710941696, 93874711363583,
STORE, 93874711367680, 93874711408639,
STORE, 93874711408640, 93874711412735,
@@ -10920,7 +10920,7 @@ ERASE, 140708393312256, 140708393316351,
ERASE, 140708393308160, 140708393312255,
ERASE, 140708393291776, 140708393308159,
};
- unsigned long set22[] = {
+ static const unsigned long set22[] = {
STORE, 93951397134336, 93951397183487,
STORE, 93951397183488, 93951397728255,
STORE, 93951397728256, 93951397826559,
@@ -11047,7 +11047,7 @@ STORE, 140551361253376, 140551361519615,
ERASE, 140551361253376, 140551361519615,
};
- unsigned long set23[] = {
+ static const unsigned long set23[] = {
STORE, 94014447943680, 94014448156671,
STORE, 94014450253824, 94014450257919,
STORE, 94014450257920, 94014450266111,
@@ -14371,7 +14371,7 @@ SNULL, 140175956627455, 140175985139711,
STORE, 140175927242752, 140175956627455,
STORE, 140175956627456, 140175985139711,
};
- unsigned long set24[] = {
+ static const unsigned long set24[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140735281639424, 140737488351231,
SNULL, 140735281643519, 140737488351231,
@@ -15533,7 +15533,7 @@ ERASE, 139635393024000, 139635401412607,
ERASE, 139635384627200, 139635384631295,
ERASE, 139635384631296, 139635393019903,
};
- unsigned long set25[] = {
+ static const unsigned long set25[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140737488343040, 140737488351231,
STORE, 140722547441664, 140737488351231,
@@ -22321,7 +22321,7 @@ STORE, 140249652703232, 140249682087935,
STORE, 140249682087936, 140249710600191,
};
- unsigned long set26[] = {
+ static const unsigned long set26[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140729464770560, 140737488351231,
SNULL, 140729464774655, 140737488351231,
@@ -22345,7 +22345,7 @@ ERASE, 140109040951296, 140109040959487,
STORE, 140109040955392, 140109040959487,
ERASE, 140109040955392, 140109040959487,
};
- unsigned long set27[] = {
+ static const unsigned long set27[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140726128070656, 140737488351231,
SNULL, 140726128074751, 140737488351231,
@@ -22741,7 +22741,7 @@ STORE, 140415509696512, 140415535910911,
ERASE, 140415537422336, 140415562588159,
STORE, 140415482433536, 140415509696511,
};
- unsigned long set28[] = {
+ static const unsigned long set28[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140722475622400, 140737488351231,
SNULL, 140722475626495, 140737488351231,
@@ -22809,7 +22809,7 @@ STORE, 139918413348864, 139918413352959,
ERASE, 139918413316096, 139918413344767,
STORE, 93865848528896, 93865848664063,
};
- unsigned long set29[] = {
+ static const unsigned long set29[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140734467944448, 140737488351231,
SNULL, 140734467948543, 140737488351231,
@@ -23684,7 +23684,7 @@ ERASE, 140143079972864, 140143088361471,
ERASE, 140143205793792, 140143205797887,
ERASE, 140143205797888, 140143214186495,
};
- unsigned long set30[] = {
+ static const unsigned long set30[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140733436743680, 140737488351231,
SNULL, 140733436747775, 140737488351231,
@@ -24566,7 +24566,7 @@ ERASE, 140165225893888, 140165225897983,
ERASE, 140165225897984, 140165234286591,
ERASE, 140165058105344, 140165058109439,
};
- unsigned long set31[] = {
+ static const unsigned long set31[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140730890784768, 140737488351231,
SNULL, 140730890788863, 140737488351231,
@@ -25379,7 +25379,7 @@ ERASE, 140623906590720, 140623914979327,
ERASE, 140622950277120, 140622950281215,
ERASE, 140622950281216, 140622958669823,
};
- unsigned long set32[] = {
+ static const unsigned long set32[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140731244212224, 140737488351231,
SNULL, 140731244216319, 140737488351231,
@@ -26175,7 +26175,7 @@ ERASE, 140400417288192, 140400425676799,
ERASE, 140400283066368, 140400283070463,
ERASE, 140400283070464, 140400291459071,
};
- unsigned long set33[] = {
+ static const unsigned long set33[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140734562918400, 140737488351231,
SNULL, 140734562922495, 140737488351231,
@@ -26317,7 +26317,7 @@ STORE, 140582961786880, 140583003750399,
ERASE, 140582961786880, 140583003750399,
};
- unsigned long set34[] = {
+ static const unsigned long set34[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140731327180800, 140737488351231,
SNULL, 140731327184895, 140737488351231,
@@ -27198,7 +27198,7 @@ ERASE, 140012522094592, 140012530483199,
ERASE, 140012033142784, 140012033146879,
ERASE, 140012033146880, 140012041535487,
};
- unsigned long set35[] = {
+ static const unsigned long set35[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140730536939520, 140737488351231,
SNULL, 140730536943615, 140737488351231,
@@ -27955,7 +27955,7 @@ ERASE, 140474471936000, 140474480324607,
ERASE, 140474396430336, 140474396434431,
ERASE, 140474396434432, 140474404823039,
};
- unsigned long set36[] = {
+ static const unsigned long set36[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140723893125120, 140737488351231,
SNULL, 140723893129215, 140737488351231,
@@ -28816,7 +28816,7 @@ ERASE, 140121890357248, 140121898745855,
ERASE, 140121269587968, 140121269592063,
ERASE, 140121269592064, 140121277980671,
};
- unsigned long set37[] = {
+ static const unsigned long set37[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140722404016128, 140737488351231,
SNULL, 140722404020223, 140737488351231,
@@ -28942,7 +28942,7 @@ STORE, 139759821246464, 139759888355327,
ERASE, 139759821246464, 139759888355327,
ERASE, 139759888355328, 139759955464191,
};
- unsigned long set38[] = {
+ static const unsigned long set38[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140730666221568, 140737488351231,
SNULL, 140730666225663, 140737488351231,
@@ -29752,7 +29752,7 @@ ERASE, 140613504712704, 140613504716799,
ERASE, 140613504716800, 140613513105407,
};
- unsigned long set39[] = {
+ static const unsigned long set39[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140736271417344, 140737488351231,
SNULL, 140736271421439, 140737488351231,
@@ -30124,7 +30124,7 @@ STORE, 140325364428800, 140325372821503,
STORE, 140325356036096, 140325364428799,
SNULL, 140325364432895, 140325372821503,
};
- unsigned long set40[] = {
+ static const unsigned long set40[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140734309167104, 140737488351231,
SNULL, 140734309171199, 140737488351231,
@@ -30875,7 +30875,7 @@ ERASE, 140320289300480, 140320289304575,
ERASE, 140320289304576, 140320297693183,
ERASE, 140320163409920, 140320163414015,
};
- unsigned long set41[] = {
+ static const unsigned long set41[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140728157171712, 140737488351231,
SNULL, 140728157175807, 140737488351231,
@@ -31185,7 +31185,7 @@ STORE, 94376135090176, 94376135094271,
STORE, 94376135094272, 94376135098367,
SNULL, 94376135094272, 94377208836095,
};
- unsigned long set42[] = {
+ static const unsigned long set42[] = {
STORE, 314572800, 1388314623,
STORE, 1462157312, 1462169599,
STORE, 1462169600, 1462185983,
@@ -33862,7 +33862,7 @@ SNULL, 3798999040, 3799101439,
*/
};
- unsigned long set43[] = {
+ static const unsigned long set43[] = {
STORE, 140737488347136, 140737488351231,
STORE, 140734187720704, 140737488351231,
SNULL, 140734187724800, 140737488351231,
@@ -34513,7 +34513,7 @@ static void *rcu_reader_rev(void *ptr)
if (mas.index != r_start) {
alt = xa_mk_value(index + i * 2 + 1 +
RCU_RANGE_COUNT);
- mt_dump(test->mt);
+ mt_dump(test->mt, mt_dump_dec);
printk("Error: %lu-%lu %p != %lu-%lu %p %p line %d i %d\n",
mas.index, mas.last, entry,
r_start, r_end, expected, alt,
@@ -34996,7 +34996,7 @@ void run_check_rcu_slowread(struct maple_tree *mt, struct rcu_test_struct *vals)
MT_BUG_ON(mt, !vals->seen_entry3);
MT_BUG_ON(mt, !vals->seen_both);
}
-static noinline void check_rcu_simulated(struct maple_tree *mt)
+static noinline void __init check_rcu_simulated(struct maple_tree *mt)
{
unsigned long i, nr_entries = 1000;
unsigned long target = 4320;
@@ -35157,7 +35157,7 @@ static noinline void check_rcu_simulated(struct maple_tree *mt)
rcu_unregister_thread();
}
-static noinline void check_rcu_threaded(struct maple_tree *mt)
+static noinline void __init check_rcu_threaded(struct maple_tree *mt)
{
unsigned long i, nr_entries = 1000;
struct rcu_test_struct vals;
@@ -35259,6 +35259,7 @@ static void mas_dfs_preorder(struct ma_state *mas)
struct maple_enode *prev;
unsigned char end, slot = 0;
+ unsigned long *pivots;
if (mas->node == MAS_START) {
mas_start(mas);
@@ -35291,6 +35292,9 @@ walk_up:
mas_ascend(mas);
goto walk_up;
}
+ pivots = ma_pivots(mte_to_node(prev), mte_node_type(prev));
+ mas->max = mas_safe_pivot(mas, pivots, slot, mte_node_type(prev));
+ mas->min = mas_safe_min(mas, pivots, slot);
return;
done:
@@ -35366,7 +35370,7 @@ static void check_dfs_preorder(struct maple_tree *mt)
/* End of depth first search tests */
/* Preallocation testing */
-static noinline void check_prealloc(struct maple_tree *mt)
+static noinline void __init check_prealloc(struct maple_tree *mt)
{
unsigned long i, max = 100;
unsigned long allocated;
@@ -35494,7 +35498,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
/* End of preallocation testing */
/* Spanning writes, writes that span nodes and layers of the tree */
-static noinline void check_spanning_write(struct maple_tree *mt)
+static noinline void __init check_spanning_write(struct maple_tree *mt)
{
unsigned long i, max = 5000;
MA_STATE(mas, mt, 1200, 2380);
@@ -35662,7 +35666,7 @@ static noinline void check_spanning_write(struct maple_tree *mt)
/* End of spanning write testing */
/* Writes to a NULL area that are adjacent to other NULLs */
-static noinline void check_null_expand(struct maple_tree *mt)
+static noinline void __init check_null_expand(struct maple_tree *mt)
{
unsigned long i, max = 100;
unsigned char data_end;
@@ -35723,7 +35727,7 @@ static noinline void check_null_expand(struct maple_tree *mt)
/* End of NULL area expansions */
/* Checking for no memory is best done outside the kernel */
-static noinline void check_nomem(struct maple_tree *mt)
+static noinline void __init check_nomem(struct maple_tree *mt)
{
MA_STATE(ms, mt, 1, 1);
@@ -35758,7 +35762,7 @@ static noinline void check_nomem(struct maple_tree *mt)
mtree_destroy(mt);
}
-static noinline void check_locky(struct maple_tree *mt)
+static noinline void __init check_locky(struct maple_tree *mt)
{
MA_STATE(ms, mt, 2, 2);
MA_STATE(reader, mt, 2, 2);
@@ -35780,10 +35784,10 @@ void farmer_tests(void)
struct maple_node *node;
DEFINE_MTREE(tree);
- mt_dump(&tree);
+ mt_dump(&tree, mt_dump_dec);
tree.ma_root = xa_mk_value(0);
- mt_dump(&tree);
+ mt_dump(&tree, mt_dump_dec);
node = mt_alloc_one(GFP_KERNEL);
node->parent = (void *)((unsigned long)(&tree) | 1);
@@ -35793,7 +35797,7 @@ void farmer_tests(void)
node->mr64.pivot[1] = 1;
node->mr64.pivot[2] = 0;
tree.ma_root = mt_mk_node(node, maple_leaf_64);
- mt_dump(&tree);
+ mt_dump(&tree, mt_dump_dec);
node->parent = ma_parent_ptr(node);
ma_free_rcu(node);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 90a62cf75008..5d6fc3f39284 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += amd-pstate
TARGETS += arm64
TARGETS += bpf
TARGETS += breakpoints
+TARGETS += cachestat
TARGETS += capabilities
TARGETS += cgroup
TARGETS += clone3
diff --git a/tools/testing/selftests/cachestat/.gitignore b/tools/testing/selftests/cachestat/.gitignore
new file mode 100644
index 000000000000..d6c30b43a4bb
--- /dev/null
+++ b/tools/testing/selftests/cachestat/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test_cachestat
diff --git a/tools/testing/selftests/cachestat/Makefile b/tools/testing/selftests/cachestat/Makefile
new file mode 100644
index 000000000000..fca73aaa7d14
--- /dev/null
+++ b/tools/testing/selftests/cachestat/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := test_cachestat
+
+CFLAGS += $(KHDR_INCLUDES)
+CFLAGS += -Wall
+CFLAGS += -lrt
+
+include ../lib.mk
diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
new file mode 100644
index 000000000000..54d09b820ed4
--- /dev/null
+++ b/tools/testing/selftests/cachestat/test_cachestat.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <linux/kernel.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include "../kselftest.h"
+
+static const char * const dev_files[] = {
+ "/dev/zero", "/dev/null", "/dev/urandom",
+ "/proc/version", "/proc"
+};
+static const int cachestat_nr = 451;
+
+void print_cachestat(struct cachestat *cs)
+{
+ ksft_print_msg(
+ "Using cachestat: Cached: %lu, Dirty: %lu, Writeback: %lu, Evicted: %lu, Recently Evicted: %lu\n",
+ cs->nr_cache, cs->nr_dirty, cs->nr_writeback,
+ cs->nr_evicted, cs->nr_recently_evicted);
+}
+
+bool write_exactly(int fd, size_t filesize)
+{
+ int random_fd = open("/dev/urandom", O_RDONLY);
+ char *cursor, *data;
+ int remained;
+ bool ret;
+
+ if (random_fd < 0) {
+ ksft_print_msg("Unable to access urandom.\n");
+ ret = false;
+ goto out;
+ }
+
+ data = malloc(filesize);
+ if (!data) {
+ ksft_print_msg("Unable to allocate data.\n");
+ ret = false;
+ goto close_random_fd;
+ }
+
+ remained = filesize;
+ cursor = data;
+
+ while (remained) {
+ ssize_t read_len = read(random_fd, cursor, remained);
+
+ if (read_len <= 0) {
+ ksft_print_msg("Unable to read from urandom.\n");
+ ret = false;
+ goto out_free_data;
+ }
+
+ remained -= read_len;
+ cursor += read_len;
+ }
+
+ /* write random data to fd */
+ remained = filesize;
+ cursor = data;
+ while (remained) {
+ ssize_t write_len = write(fd, cursor, remained);
+
+ if (write_len <= 0) {
+ ksft_print_msg("Unable write random data to file.\n");
+ ret = false;
+ goto out_free_data;
+ }
+
+ remained -= write_len;
+ cursor += write_len;
+ }
+
+ ret = true;
+out_free_data:
+ free(data);
+close_random_fd:
+ close(random_fd);
+out:
+ return ret;
+}
+
+/*
+ * Open/create the file at filename, (optionally) write random data to it
+ * (exactly num_pages), then test the cachestat syscall on this file.
+ *
+ * If test_fsync == true, fsync the file, then check the number of dirty
+ * pages.
+ */
+bool test_cachestat(const char *filename, bool write_random, bool create,
+ bool test_fsync, unsigned long num_pages, int open_flags,
+ mode_t open_mode)
+{
+ size_t PS = sysconf(_SC_PAGESIZE);
+ int filesize = num_pages * PS;
+ bool ret = true;
+ long syscall_ret;
+ struct cachestat cs;
+ struct cachestat_range cs_range = { 0, filesize };
+
+ int fd = open(filename, open_flags, open_mode);
+
+ if (fd == -1) {
+ ksft_print_msg("Unable to create/open file.\n");
+ ret = false;
+ goto out;
+ } else {
+ ksft_print_msg("Create/open %s\n", filename);
+ }
+
+ if (write_random) {
+ if (!write_exactly(fd, filesize)) {
+ ksft_print_msg("Unable to access urandom.\n");
+ ret = false;
+ goto out1;
+ }
+ }
+
+ syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0);
+
+ ksft_print_msg("Cachestat call returned %ld\n", syscall_ret);
+
+ if (syscall_ret) {
+ ksft_print_msg("Cachestat returned non-zero.\n");
+ ret = false;
+ goto out1;
+
+ } else {
+ print_cachestat(&cs);
+
+ if (write_random) {
+ if (cs.nr_cache + cs.nr_evicted != num_pages) {
+ ksft_print_msg(
+ "Total number of cached and evicted pages is off.\n");
+ ret = false;
+ }
+ }
+ }
+
+ if (test_fsync) {
+ if (fsync(fd)) {
+ ksft_print_msg("fsync fails.\n");
+ ret = false;
+ } else {
+ syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0);
+
+ ksft_print_msg("Cachestat call (after fsync) returned %ld\n",
+ syscall_ret);
+
+ if (!syscall_ret) {
+ print_cachestat(&cs);
+
+ if (cs.nr_dirty) {
+ ret = false;
+ ksft_print_msg(
+ "Number of dirty should be zero after fsync.\n");
+ }
+ } else {
+ ksft_print_msg("Cachestat (after fsync) returned non-zero.\n");
+ ret = false;
+ goto out1;
+ }
+ }
+ }
+
+out1:
+ close(fd);
+
+ if (create)
+ remove(filename);
+out:
+ return ret;
+}
+
+bool test_cachestat_shmem(void)
+{
+ size_t PS = sysconf(_SC_PAGESIZE);
+ size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */
+ int syscall_ret;
+ size_t compute_len = PS * 512;
+ struct cachestat_range cs_range = { PS, compute_len };
+ char *filename = "tmpshmcstat";
+ struct cachestat cs;
+ bool ret = true;
+ unsigned long num_pages = compute_len / PS;
+ int fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+
+ if (fd < 0) {
+ ksft_print_msg("Unable to create shmem file.\n");
+ ret = false;
+ goto out;
+ }
+
+ if (ftruncate(fd, filesize)) {
+ ksft_print_msg("Unable to truncate shmem file.\n");
+ ret = false;
+ goto close_fd;
+ }
+
+ if (!write_exactly(fd, filesize)) {
+ ksft_print_msg("Unable to write to shmem file.\n");
+ ret = false;
+ goto close_fd;
+ }
+
+ syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0);
+
+ if (syscall_ret) {
+ ksft_print_msg("Cachestat returned non-zero.\n");
+ ret = false;
+ goto close_fd;
+ } else {
+ print_cachestat(&cs);
+ if (cs.nr_cache + cs.nr_evicted != num_pages) {
+ ksft_print_msg(
+ "Total number of cached and evicted pages is off.\n");
+ ret = false;
+ }
+ }
+
+close_fd:
+ shm_unlink(filename);
+out:
+ return ret;
+}
+
+int main(void)
+{
+ int ret = 0;
+
+ for (int i = 0; i < 5; i++) {
+ const char *dev_filename = dev_files[i];
+
+ if (test_cachestat(dev_filename, false, false, false,
+ 4, O_RDONLY, 0400))
+ ksft_test_result_pass("cachestat works with %s\n", dev_filename);
+ else {
+ ksft_test_result_fail("cachestat fails with %s\n", dev_filename);
+ ret = 1;
+ }
+ }
+
+ if (test_cachestat("tmpfilecachestat", true, true,
+ true, 4, O_CREAT | O_RDWR, 0400 | 0600))
+ ksft_test_result_pass("cachestat works with a normal file\n");
+ else {
+ ksft_test_result_fail("cachestat fails with normal file\n");
+ ret = 1;
+ }
+
+ if (test_cachestat_shmem())
+ ksft_test_result_pass("cachestat works with a shmem file\n");
+ else {
+ ksft_test_result_fail("cachestat fails with a shmem file\n");
+ ret = 1;
+ }
+
+ return ret;
+}
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 9bfe1d6f6529..e033c79d528e 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -61,8 +61,7 @@ static void async_pf_execute(struct work_struct *work)
* access remotely.
*/
mmap_read_lock(mm);
- get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, NULL,
- &locked);
+ get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
if (locked)
mmap_read_unlock(mm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cb5c13eee193..eaa5bb8dbadc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2477,7 +2477,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
{
int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
- rc = get_user_pages(addr, 1, flags, NULL, NULL);
+ rc = get_user_pages(addr, 1, flags, NULL);
return rc == -EHWPOISON;
}