diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2018-01-29 18:41:34 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2018-01-29 18:41:34 +1100 |
commit | 433ac687a018c5ed2572418a263f384c08d6ad18 (patch) | |
tree | 9ada1230c2b92b1e3534ea33c941cdc5ba32f1c9 | |
parent | 1ce4714d523620b40fdc6ab0f785cdcbd79614f8 (diff) | |
parent | 7ada87bbb98dfeca138875718ce9caf397bf40ef (diff) | |
download | linux-next-433ac687a018c5ed2572418a263f384c08d6ad18.tar.gz |
Merge branch 'akpm-current/current'
284 files changed, 5353 insertions, 3567 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf5..3187a18af6da 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 74cdeaed9f7a..85ff09c695bd 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -48,6 +48,7 @@ v1 is available under Documentation/cgroup-v1/. 5-2-1. Memory Interface Files 5-2-2. Usage Guidelines 5-2-3. Memory Ownership + 5-2-4. OOM Killer 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback @@ -1039,6 +1040,31 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.oom_group + + A read-write single value file which exists on non-root + cgroups. The default is "0". + + If set, OOM killer will consider the memory cgroup as an + indivisible memory consumers and compare it with other memory + consumers by it's memory footprint. + If such memory cgroup is selected as an OOM victim, all + processes belonging to it or it's descendants will be killed. + + This applies to system-wide OOM conditions and reaching + the hard memory limit of the cgroup and their ancestor. + If OOM condition happens in a descendant cgroup with it's own + memory limit, the memory cgroup can't be considered + as an OOM victim, and OOM killer will not kill all belonging + tasks. + + Also, OOM killer respects the /proc/pid/oom_score_adj value -1000, + and will never kill the unkillable task, even if memory.oom_group + is set. + + If cgroup-aware OOM killer is not enabled, ENOTSUPP error + is returned on attempt to access the file. + memory.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified @@ -1242,6 +1268,47 @@ to be accessed repeatedly by other cgroups, it may make sense to use POSIX_FADV_DONTNEED to relinquish the ownership of memory areas belonging to the affected files to ensure correct memory ownership. +OOM Killer +~~~~~~~~~~ + +Cgroup v2 memory controller implements a cgroup-aware OOM killer. +It means that it treats cgroups as first class OOM entities. + +Cgroup-aware OOM logic is turned off by default and requires +passing the "groupoom" option on mounting cgroupfs. It can also +by remounting cgroupfs with the following command:: + + # mount -o remount,groupoom $MOUNT_POINT + +Under OOM conditions the memory controller tries to make the best +choice of a victim, looking for a memory cgroup with the largest +memory footprint, considering leaf cgroups and cgroups with the +memory.oom_group option set, which are considered to be an indivisible +memory consumers. + +By default, OOM killer will kill the biggest task in the selected +memory cgroup. A user can change this behavior by enabling +the per-cgroup memory.oom_group option. If set, it causes +the OOM killer to kill all processes attached to the cgroup, +except processes with oom_score_adj set to -1000. + +This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM +the memory controller considers only cgroups belonging to the sub-tree +of the OOM'ing cgroup. + +The root cgroup is treated as a leaf memory cgroup, so it's compared +with other leaf memory cgroups and cgroups with oom_group option set. + +If there are no cgroups with the enabled memory controller, +the OOM killer is using the "traditional" process-based approach. + +Please, note that memory charges are not migrating if tasks +are moved between different memory cgroups. Moving tasks with +significant memory footprint may affect OOM victim selection logic. +If it's a case, please, consider creating a common ancestor for +the source and destination memory cgroups and enabling oom_group +on ancestor layer. + IO -- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5025ff9307e6..17256f2ad919 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold -- hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode - legacy_va_layout @@ -261,30 +260,6 @@ any throttling. ============================================================== -hugepages_treat_as_movable - -This parameter controls whether we can allocate hugepages from ZONE_MOVABLE -or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. -ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, -so this parameter has no effect if used without kernelcore=. - -Hugepage migration is now available in some situations which depend on the -architecture and/or the hugepage size. If a hugepage supports migration, -allocation from ZONE_MOVABLE is always enabled for the hugepage regardless -of the value of this parameter. -IOW, this parameter affects only non-migratable hugepages. - -Assuming that hugepages are not migratable in your system, one usecase of -this parameter is that users can make hugepage pool more extensible by -enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE -page reclaim/migration/compaction work more and you can get contiguous -memory more likely. Note that using ZONE_MOVABLE for non-migratable -hugepages can do harm to other features like memory hotremove (because -memory hotremove expects that memory blocks on ZONE_MOVABLE are always -removable,) so it's a trade-off responsible for the users. - -============================================================== - hugetlb_shm_group hugetlb_shm_group contains group id that is allowed to create SysV @@ -337,8 +312,6 @@ The lowmem_reserve_ratio is an array. You can see them by reading this file. % cat /proc/sys/vm/lowmem_reserve_ratio 256 256 32 - -Note: # of this elements is one fewer than number of zones. Because the highest - zone's value is not necessary for following calculation. But, these values are not used directly. The kernel calculates # of protection pages for each zones from them. These are shown as array of protection pages @@ -389,7 +362,8 @@ As above expression, they are reciprocal number of ratio. pages of higher zones on the node. If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). +The minimum value is 1 (1/1 -> 100%). The value less than 1 completely +disables protection of the pages. ============================================================== diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 59cbc803aad6..faf077d50d42 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -20,19 +20,20 @@ options. The /proc/meminfo file provides information about the total number of persistent hugetlb pages in the kernel's huge page pool. It also displays -information about the number of free, reserved and surplus huge pages and the -default huge page size. The huge page size is needed for generating the -proper alignment and size of the arguments to system calls that map huge page -regions. +default huge page size and information about the number of free, reserved +and surplus huge pages in the pool of huge pages of default size. +The huge page size is needed for generating the proper alignment and +size of the arguments to system calls that map huge page regions. The output of "cat /proc/meminfo" will include lines like: ..... -HugePages_Total: vvv -HugePages_Free: www -HugePages_Rsvd: xxx -HugePages_Surp: yyy -Hugepagesize: zzz kB +HugePages_Total: uuu +HugePages_Free: vvv +HugePages_Rsvd: www +HugePages_Surp: xxx +Hugepagesize: yyy kB +Hugetlb: zzz kB where: HugePages_Total is the size of the pool of huge pages. @@ -47,6 +48,14 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. The maximum number of surplus huge pages is controlled by /proc/sys/vm/nr_overcommit_hugepages. +Hugepagesize is the default hugepage size (in Kb). +Hugetlb is the total amount of memory (in kB), consumed by huge + pages of all sizes. + If huge pages of different sizes are in use, this number + will exceed HugePages_Total * Hugepagesize. To get more + detailed information, please, refer to + /sys/kernel/mm/hugepages (described below). + /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. @@ -434,7 +434,8 @@ export MAKE LEX YACC AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_KASAN CFLAGS_UBSAN +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE +export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -679,6 +680,10 @@ endif # This selects the stack protector compiler flag. Testing it is delayed # until after .config has been reprocessed, in the prepare-compiler-check # target. +ifdef CONFIG_CC_STACKPROTECTOR_AUTO + stackp-flag := $(call cc-option,-fstack-protector-strong,$(call cc-option,-fstack-protector)) + stackp-name := AUTO +else ifdef CONFIG_CC_STACKPROTECTOR_REGULAR stackp-flag := -fstack-protector stackp-name := REGULAR @@ -687,16 +692,40 @@ ifdef CONFIG_CC_STACKPROTECTOR_STRONG stackp-flag := -fstack-protector-strong stackp-name := STRONG else + # If either there is no stack protector for this architecture or + # CONFIG_CC_STACKPROTECTOR_NONE is selected, we're done, and $(stackp-name) + # is empty, skipping all remaining stack protector tests. + # # Force off for distro compilers that enable stack protector by default. - stackp-flag := $(call cc-option, -fno-stack-protector) + KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) +endif endif endif # Find arch-specific stack protector compiler sanity-checking script. -ifdef CONFIG_CC_STACKPROTECTOR +ifdef stackp-name +ifneq ($(stackp-flag),) stackp-path := $(srctree)/scripts/gcc-$(SRCARCH)_$(BITS)-has-stack-protector.sh stackp-check := $(wildcard $(stackp-path)) + # If the wildcard test matches a test script, run it to check functionality. + ifdef stackp-check + ifneq ($(shell $(CONFIG_SHELL) $(stackp-check) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) + stackp-broken := y + endif + endif + ifndef stackp-broken + # If the stack protector is functional, enable code that depends on it. + KBUILD_CPPFLAGS += -DCONFIG_CC_STACKPROTECTOR + # Either we've already detected the flag (for AUTO) or we'll fail the + # build in the prepare-compiler-check rule (for specific flag). + KBUILD_CFLAGS += $(stackp-flag) + else + # We have to make sure stack protector is unconditionally disabled if + # the compiler is broken (in case we're going to continue the build in + # AUTO mode). + KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) + endif +endif endif -KBUILD_CFLAGS += $(stackp-flag) ifeq ($(cc-name),clang) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) @@ -1091,14 +1120,25 @@ PHONY += prepare-compiler-check prepare-compiler-check: FORCE # Make sure compiler supports requested stack protector flag. ifdef stackp-name + # Warn about CONFIG_CC_STACKPROTECTOR_AUTO having found no option. + ifeq ($(stackp-flag),) + @echo CONFIG_CC_STACKPROTECTOR_$(stackp-name): \ + Compiler does not support any known stack-protector >&2 + else + # Fail if specifically requested stack protector is missing. ifeq ($(call cc-option, $(stackp-flag)),) @echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \ $(stackp-flag) not supported by compiler >&2 && exit 1 endif + endif endif -# Make sure compiler does not have buggy stack-protector support. -ifdef stackp-check - ifneq ($(shell $(CONFIG_SHELL) $(stackp-check) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) +# Make sure compiler does not have buggy stack-protector support. If a +# specific stack-protector was requested, fail the build, otherwise warn. +ifdef stackp-broken + ifeq ($(stackp-name),AUTO) + @echo CONFIG_CC_STACKPROTECTOR_$(stackp-name): \ + $(stackp-flag) available but compiler is broken: disabling >&2 + else @echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \ $(stackp-flag) available but compiler is broken >&2 && exit 1 endif diff --git a/arch/Kconfig b/arch/Kconfig index 467dfa35bf96..76c0b54443b1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -538,16 +538,10 @@ config HAVE_CC_STACKPROTECTOR - its compiler supports the -fstack-protector option - it has implemented a stack canary (e.g. __stack_chk_guard) -config CC_STACKPROTECTOR - def_bool n - help - Set when a stack-protector mode is enabled, so that the build - can enable kernel-side support for the GCC feature. - choice prompt "Stack Protector buffer overflow detection" depends on HAVE_CC_STACKPROTECTOR - default CC_STACKPROTECTOR_NONE + default CC_STACKPROTECTOR_AUTO help This option turns on the "stack-protector" GCC feature. This feature puts, at the beginning of functions, a canary value on @@ -564,7 +558,6 @@ config CC_STACKPROTECTOR_NONE config CC_STACKPROTECTOR_REGULAR bool "Regular" - select CC_STACKPROTECTOR help Functions will have the stack-protector canary logic added if they have an 8-byte or larger character array on the stack. @@ -578,7 +571,6 @@ config CC_STACKPROTECTOR_REGULAR config CC_STACKPROTECTOR_STRONG bool "Strong" - select CC_STACKPROTECTOR help Functions will have the stack-protector canary logic added in any of the following conditions: @@ -596,6 +588,12 @@ config CC_STACKPROTECTOR_STRONG about 20% of all kernel functions, which increases the kernel code size by about 2%. +config CC_STACKPROTECTOR_AUTO + bool "Automatic" + help + If the compiler supports it, the best available stack-protector + option will be chosen. + endchoice config THIN_ARCHIVES diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 2dbdf59258d9..f9d4e6b6d4bd 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -32,6 +32,7 @@ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ #define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x100000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index ea022d47896c..21ec82466d62 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -23,7 +23,8 @@ void die(const char *str, struct pt_regs *regs, unsigned long address); #define BUG() do { \ pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \ - dump_stack(); \ + barrier_before_unreachable(); \ + __builtin_trap(); \ } while (0) #define HAVE_ARCH_BUG diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index b18fcb606908..dc8ee011882f 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -74,4 +74,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/ +#define pmdp_establish generic_pmdp_establish + #endif diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index ce5ee762ed66..4cab9bb823fb 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -338,6 +338,7 @@ static inline int find_next_bit_le(const void *p, int size, int offset) #endif +#include <asm-generic/bitops/find.h> #include <asm-generic/bitops/le.h> /* diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 1a7a17b2a1ba..2a4836087358 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,9 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +/* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */ +#define pmdp_establish generic_pmdp_establish + /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */ static inline pmd_t pmd_mknotpresent(pmd_t pmd) { diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ada8eb206a90..8c398fedbbb6 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -466,6 +466,12 @@ void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) void __init dma_contiguous_remap(void) { int i; + + if (!dma_mmu_remap_num) + return; + + /* call flush_cache_all() since CMA area would be large enough */ + flush_cache_all(); for (i = 0; i < dma_mmu_remap_num; i++) { phys_addr_t start = dma_mmu_remap[i].base; phys_addr_t end = start + dma_mmu_remap[i].size; @@ -498,7 +504,15 @@ void __init dma_contiguous_remap(void) flush_tlb_kernel_range(__phys_to_virt(start), __phys_to_virt(end)); - iotable_init(&map, 1); + /* + * All the memory in CMA region will be on ZONE_MOVABLE. + * If that zone is considered as highmem, the memory in CMA + * region is also considered as highmem even if it's + * physical address belong to lowmem. In this case, + * re-mapping isn't required. + */ + if (!is_highmem_idx(ZONE_MOVABLE)) + iotable_init(&map, 1); } } diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index e266f80e45b7..8758bb008436 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -12,7 +12,8 @@ /* * KASAN_SHADOW_START: beginning of the kernel virtual addresses. - * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses. + * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/N of kernel virtual addresses, + * where N = (1 << KASAN_SHADOW_SCALE_SHIFT). */ #define KASAN_SHADOW_START (VA_START) #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) @@ -20,14 +21,16 @@ /* * This value is used to map an address to the corresponding shadow * address by the following formula: - * shadow_addr = (address >> 3) + KASAN_SHADOW_OFFSET; + * shadow_addr = (address >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET * - * (1 << 61) shadow addresses - [KASAN_SHADOW_OFFSET,KASAN_SHADOW_END] - * cover all 64-bits of virtual addresses. So KASAN_SHADOW_OFFSET - * should satisfy the following equation: - * KASAN_SHADOW_OFFSET = KASAN_SHADOW_END - (1ULL << 61) + * (1 << (64 - KASAN_SHADOW_SCALE_SHIFT)) shadow addresses that lie in range + * [KASAN_SHADOW_OFFSET, KASAN_SHADOW_END) cover all 64-bits of virtual + * addresses. So KASAN_SHADOW_OFFSET should satisfy the following equation: + * KASAN_SHADOW_OFFSET = KASAN_SHADOW_END - + * (1ULL << (64 - KASAN_SHADOW_SCALE_SHIFT)) */ -#define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3))) +#define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << \ + (64 - KASAN_SHADOW_SCALE_SHIFT))) void kasan_init(void); void kasan_copy_shadow(pgd_t *pgdir); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index d4bae7d6e0d8..50fa96a49792 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -85,7 +85,8 @@ * stack size when KASAN is in use. */ #ifdef CONFIG_KASAN -#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#define KASAN_SHADOW_SCALE_SHIFT 3 +#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT)) #define KASAN_THREAD_SHIFT 1 #else #define KASAN_SHADOW_SIZE (0) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 89167c43ebb5..094374c82db0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -706,6 +706,13 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, { ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } + +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); +} #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3affca3dd96a..75b220ba73a3 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -925,9 +925,8 @@ static void __armv8pmu_probe_pmu(void *info) pmceid[0] = read_sysreg(pmceid0_el0); pmceid[1] = read_sysreg(pmceid1_el0); - bitmap_from_u32array(cpu_pmu->pmceid_bitmap, - ARMV8_PMUV3_MAX_COMMON_EVENTS, pmceid, - ARRAY_SIZE(pmceid)); + bitmap_from_arr32(cpu_pmu->pmceid_bitmap, + pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); } static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index acba49fb5aac..6e02e6fb4c7b 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -135,7 +135,8 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, /* The early shadow maps everything to a single page of zeroes */ asmlinkage void __init kasan_early_init(void) { - BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61)); + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != + KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, diff --git a/arch/cris/include/arch-v10/arch/bug.h b/arch/cris/include/arch-v10/arch/bug.h index 905afeacfedf..06da9d49152a 100644 --- a/arch/cris/include/arch-v10/arch/bug.h +++ b/arch/cris/include/arch-v10/arch/bug.h @@ -44,18 +44,25 @@ struct bug_frame { * not be used like this with newer versions of gcc. */ #define BUG() \ +do { \ __asm__ __volatile__ ("clear.d [" __stringify(BUG_MAGIC) "]\n\t"\ "movu.w " __stringify(__LINE__) ",$r0\n\t"\ "jump 0f\n\t" \ ".section .rodata\n" \ "0:\t.string \"" __FILE__ "\"\n\t" \ - ".previous") + ".previous"); \ + unreachable(); \ +} while (0) #endif #else /* This just causes an oops. */ -#define BUG() (*(int *)0 = 0) +#define BUG() \ +do { \ + barrier_before_unreachable(); \ + __builtin_trap(); \ +} while (0) #endif diff --git a/arch/ia64/include/asm/bug.h b/arch/ia64/include/asm/bug.h index bd3eeb8d1cfa..66b37a532765 100644 --- a/arch/ia64/include/asm/bug.h +++ b/arch/ia64/include/asm/bug.h @@ -4,7 +4,11 @@ #ifdef CONFIG_BUG #define ia64_abort() __builtin_trap() -#define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); ia64_abort(); } while (0) +#define BUG() do { \ + printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + barrier_before_unreachable(); \ + ia64_abort(); \ +} while (0) /* should this BUG be made generic? */ #define HAVE_ARCH_BUG diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index c44f002e8f6b..858602494096 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2610,17 +2610,10 @@ pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task) if (pid < 2) return -EPERM; if (pid != task_pid_vnr(current)) { - - read_lock(&tasklist_lock); - - p = find_task_by_vpid(pid); - /* make sure task cannot go away while we operate on it */ - if (p) get_task_struct(p); - - read_unlock(&tasklist_lock); - - if (p == NULL) return -ESRCH; + p = find_get_task_by_vpid(pid); + if (!p) + return -ESRCH; } ret = pfm_task_incompatible(ctx, p); diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index b88a8dd14933..a6f300a208bd 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -115,14 +115,6 @@ static void set_eit_vector_entries(void) _flush_cache_copyback_all(); } -void abort(void) -{ - BUG(); - - /* if that doesn't kill us, halt */ - panic("Oops failed to kill thread"); -} - void __init trap_init(void) { set_eit_vector_entries(); diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index dda58cfe8c22..93b47b1f6fb4 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -311,7 +311,6 @@ static inline int bfchg_mem_test_and_change_bit(int nr, * functions. */ #if defined(CONFIG_CPU_HAS_NO_BITFIELDS) -#include <asm-generic/bitops/find.h> #include <asm-generic/bitops/ffz.h> #else @@ -441,6 +440,8 @@ static inline unsigned long ffz(unsigned long word) #endif +#include <asm-generic/bitops/find.h> + #ifdef __KERNEL__ #if defined(CONFIG_CPU_HAS_NO_BITFIELDS) diff --git a/arch/m68k/include/asm/bug.h b/arch/m68k/include/asm/bug.h index b7e2bf1ba4a6..275dca1435bf 100644 --- a/arch/m68k/include/asm/bug.h +++ b/arch/m68k/include/asm/bug.h @@ -8,16 +8,19 @@ #ifndef CONFIG_SUN3 #define BUG() do { \ pr_crit("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + barrier_before_unreachable(); \ __builtin_trap(); \ } while (0) #else #define BUG() do { \ pr_crit("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + barrier_before_unreachable(); \ panic("BUG!"); \ } while (0) #endif #else #define BUG() do { \ + barrier_before_unreachable(); \ __builtin_trap(); \ } while (0) #endif diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 0909834c83a7..0cca2c95a091 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -399,7 +399,7 @@ unsigned long __metag_elf_map(struct file *filep, unsigned long addr, tcm_tag = tcm_lookup_tag(addr); if (tcm_tag != TCM_INVALID_TAG) - type &= ~MAP_FIXED; + type &= ~(MAP_FIXED | MAP_FIXED_NOREPLACE); /* * total_size is the size of the ELF (interpreter) image. @@ -417,6 +417,10 @@ unsigned long __metag_elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, (void *)addr); + if (!BAD_ADDR(map_addr) && tcm_tag != TCM_INVALID_TAG) { struct tcm_allocation *tcm; unsigned long tcm_addr; diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 1a508a74d48d..129e0328367f 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -534,6 +534,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/ +#define pmdp_establish generic_pmdp_establish + #define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 606e02ca4b6c..3035ca499cd8 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -50,6 +50,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ /* * Flags for msync diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 80510ba44c08..0ebe24344a34 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -26,6 +26,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index cc5e1a1f4e6a..9d3329811cc1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -153,7 +153,6 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 8a91f2475ea3..949d691094a4 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -121,8 +121,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 62e101b43cf6..338b7da468ce 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -232,8 +232,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 1c495068bcfa..51017726d495 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1212,17 +1212,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, } #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp); - -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - if (radix_enabled()) - return radix__pmdp_huge_split_prepare(vma, address, pmdp); - return hash__pmdp_huge_split_prepare(vma, address, pmdp); -} +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 19c44e1495ae..365010f66570 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -269,12 +269,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - /* Nothing to do for radix. */ - return; -} extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e0a2d8e806ed..91ee2231c527 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -75,8 +75,7 @@ EXPORT_SYMBOL_GPL(mm_iommu_preregistered); /* * Taken from alloc_migrate_target with changes to remove CMA allocations */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, - int **resultp) +struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) { gfp_t gfp_mask = GFP_USER; struct page *new_page; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 3b65917785a5..422e80253a33 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm) * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + unsigned long old_pmd; + + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. */ serialize_against_pte_lookup(vma->vm_mm); + return __pmd(old_pmd); } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index ec277913e01b..469808e77e58 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } -void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - VM_BUG_ON(pmd_devmap(*pmdp)); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); -} - /* * A linux hugepage PMD was changed and the corresponding hash table entries * neesd to be flushed. diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9376637229c9..0105ce28e246 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -108,7 +108,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0a6b0286c32e..2d24d33bf188 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1505,12 +1505,12 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_INVALIDATE -static inline void pmdp_invalidate(struct vm_area_struct *vma, +static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); + return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT diff --git a/arch/sparc/include/asm/bug.h b/arch/sparc/include/asm/bug.h index 6f17528356b2..ea53e418f6c0 100644 --- a/arch/sparc/include/asm/bug.h +++ b/arch/sparc/include/asm/bug.h @@ -9,10 +9,14 @@ void do_BUG(const char *file, int line); #define BUG() do { \ do_BUG(__FILE__, __LINE__); \ + barrier_before_unreachable(); \ __builtin_trap(); \ } while (0) #else -#define BUG() __builtin_trap() +#define BUG() do { \ + barrier_before_unreachable(); \ + __builtin_trap(); \ +} while (0) #endif #define HAVE_ARCH_BUG diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 9937c5ff94a9..339920fdf9ed 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1010,7 +1010,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PGTABLE_DEPOSIT diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 4ae86bc0d35c..847ddffbf38a 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -219,17 +219,28 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} + /* * This routine is only called when splitting a THP */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - - pmd_val(entry) &= ~_PAGE_VALID; + pmd_t old, entry; - set_pmd_at(vma->vm_mm, address, pmdp, entry); + entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID); + old = pmdp_establish(vma, address, pmdp, entry); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* @@ -240,6 +251,8 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, if ((pmd_val(entry) & _PAGE_PMD_HUGE) && !is_huge_zero_page(pmd_page(entry))) (vma->vm_mm)->context.thp_pte_count--; + + return old; } void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h index 401f597bc38c..c0cbdbe17168 100644 --- a/arch/unicore32/include/asm/bitops.h +++ b/arch/unicore32/include/asm/bitops.h @@ -44,4 +44,6 @@ static inline int fls(int x) #define find_first_bit find_first_bit #define find_first_zero_bit find_first_zero_bit +#include <asm-generic/bitops/find.h> + #endif /* __UNICORE_BITOPS_H__ */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6b35b9dc320..d9878f1f3bac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS @@ -323,7 +322,7 @@ config X86_64_SMP config X86_32_LAZY_GS def_bool y - depends on X86_32 && !CC_STACKPROTECTOR + depends on X86_32 && CC_STACKPROTECTOR_NONE config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index b577dd0916aa..13e70da38bed 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -4,6 +4,7 @@ #include <linux/const.h> #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) +#define KASAN_SHADOW_SCALE_SHIFT 3 /* * Compiler uses shadow offset assuming that addresses start @@ -12,12 +13,15 @@ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT */ #define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ - ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3)) + ((-1UL << __VIRTUAL_MASK_SHIFT) >> \ + KASAN_SHADOW_SCALE_SHIFT)) /* - * 47 bits for kernel address -> (47 - 3) bits for shadow - * 56 bits for kernel address -> (56 - 3) bits for shadow + * 47 bits for kernel address -> (47 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow + * 56 bits for kernel address -> (56 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow */ -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3))) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + \ + (1ULL << (__VIRTUAL_MASK_SHIFT - \ + KASAN_SHADOW_SCALE_SHIFT))) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index bc4af5453802..f24df59c40b2 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -158,7 +158,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif -#ifdef CONFIG_SMP union split_pmd { struct { u32 pmd_low; @@ -166,6 +165,8 @@ union split_pmd { }; pmd_t pmd; }; + +#ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) { union split_pmd res, *orig = (union split_pmd *)pmdp; @@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifndef pmdp_establish +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + /* + * If pmd has present bit cleared we can get away without expensive + * cmpxchg64: we can update pmdp half-by-half without racing with + * anybody. + */ + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { + union split_pmd old, new, *ptr; + + ptr = (union split_pmd *)pmdp; + + new.pmd = pmd; + + /* xchg acts as a barrier before setting of the high bits */ + old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); + old.pmd_high = ptr->pmd_high; + ptr->pmd_high = new.pmd_high; + return old.pmd; + } + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} +#endif + #ifdef CONFIG_SMP union split_pud { struct { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e42b8943cb1a..63c2552b6b65 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1109,6 +1109,21 @@ static inline int pud_write(pud_t pud) return pud_flags(pud) & _PAGE_RW; } +#ifndef pmdp_establish +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pmdp, pmd); + } else { + pmd_t old = *pmdp; + *pmdp = pmd; + return old; + } +} +#endif + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 1f790cf9d38f..ed77e9990146 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -358,6 +358,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 3e9d01ada81f..58f29a9d895d 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -57,6 +57,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED # define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/block/genhd.c b/block/genhd.c index 88a53c188cb7..4d42ae5e8382 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -970,7 +970,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e7b3ce123da6..70aceefe14d5 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 8696382be837..1d0b53a04a08 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 859b61ddd508..35a408d0ae4f 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -291,6 +291,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index 9918eda0e05f..a3454eb56fbf 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/drivers/pps/generators/pps_gen_parport.c b/drivers/pps/generators/pps_gen_parport.c index dcd39fba6ddd..51cfde6afffd 100644 --- a/drivers/pps/generators/pps_gen_parport.c +++ b/drivers/pps/generators/pps_gen_parport.c @@ -70,7 +70,7 @@ static long hrtimer_error = SAFETY_INTERVAL; /* the kernel hrtimer event */ static enum hrtimer_restart hrtimer_event(struct hrtimer *timer) { - struct timespec expire_time, ts1, ts2, ts3, dts; + struct timespec64 expire_time, ts1, ts2, ts3, dts; struct pps_generator_pp *dev; struct parport *port; long lim, delta; @@ -78,7 +78,7 @@ static enum hrtimer_restart hrtimer_event(struct hrtimer *timer) /* We have to disable interrupts here. The idea is to prevent * other interrupts on the same processor to introduce random - * lags while polling the clock. getnstimeofday() takes <1us on + * lags while polling the clock. ktime_get_real_ts64() takes <1us on * most machines while other interrupt handlers can take much * more potentially. * @@ -88,22 +88,22 @@ static enum hrtimer_restart hrtimer_event(struct hrtimer *timer) local_irq_save(flags); /* first of all we get the time stamp... */ - getnstimeofday(&ts1); - expire_time = ktime_to_timespec(hrtimer_get_softexpires(timer)); + ktime_get_real_ts64(&ts1); + expire_time = ktime_to_timespec64(hrtimer_get_softexpires(timer)); dev = container_of(timer, struct pps_generator_pp, timer); lim = NSEC_PER_SEC - send_delay - dev->port_write_time; /* check if we are late */ if (expire_time.tv_sec != ts1.tv_sec || ts1.tv_nsec > lim) { local_irq_restore(flags); - pr_err("we are late this time %ld.%09ld\n", - ts1.tv_sec, ts1.tv_nsec); + pr_err("we are late this time %lld.%09ld\n", + (s64)ts1.tv_sec, ts1.tv_nsec); goto done; } /* busy loop until the time is right for an assert edge */ do { - getnstimeofday(&ts2); + ktime_get_real_ts64(&ts2); } while (expire_time.tv_sec == ts2.tv_sec && ts2.tv_nsec < lim); /* set the signal */ @@ -113,25 +113,25 @@ static enum hrtimer_restart hrtimer_event(struct hrtimer *timer) /* busy loop until the time is right for a clear edge */ lim = NSEC_PER_SEC - dev->port_write_time; do { - getnstimeofday(&ts2); + ktime_get_real_ts64(&ts2); } while (expire_time.tv_sec == ts2.tv_sec && ts2.tv_nsec < lim); /* unset the signal */ port->ops->write_control(port, NO_SIGNAL); - getnstimeofday(&ts3); + ktime_get_real_ts64(&ts3); local_irq_restore(flags); /* update calibrated port write time */ - dts = timespec_sub(ts3, ts2); + dts = timespec64_sub(ts3, ts2); dev->port_write_time = - (dev->port_write_time + timespec_to_ns(&dts)) >> 1; + (dev->port_write_time + timespec64_to_ns(&dts)) >> 1; done: /* update calibrated hrtimer error */ - dts = timespec_sub(ts1, expire_time); - delta = timespec_to_ns(&dts); + dts = timespec64_sub(ts1, expire_time); + delta = timespec64_to_ns(&dts); /* If the new error value is bigger then the old, use the new * value, if not then slowly move towards the new value. This * way it should be safe in bad conditions and efficient in @@ -161,17 +161,17 @@ static void calibrate_port(struct pps_generator_pp *dev) long acc = 0; for (i = 0; i < (1 << PORT_NTESTS_SHIFT); i++) { - struct timespec a, b; + struct timespec64 a, b; unsigned long irq_flags; local_irq_save(irq_flags); - getnstimeofday(&a); + ktime_get_real_ts64(&a); port->ops->write_control(port, NO_SIGNAL); - getnstimeofday(&b); + ktime_get_real_ts64(&b); local_irq_restore(irq_flags); - b = timespec_sub(b, a); - acc += timespec_to_ns(&b); + b = timespec64_sub(b, a); + acc += timespec64_to_ns(&b); } dev->port_write_time = acc >> PORT_NTESTS_SHIFT; @@ -180,9 +180,9 @@ static void calibrate_port(struct pps_generator_pp *dev) static inline ktime_t next_intr_time(struct pps_generator_pp *dev) { - struct timespec ts; + struct timespec64 ts; - getnstimeofday(&ts); + ktime_get_real_ts64(&ts); return ktime_set(ts.tv_sec + ((ts.tv_nsec > 990 * NSEC_PER_MSEC) ? 1 : 0), diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index e2a418598129..006ea5a45020 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -222,7 +222,7 @@ static int tsi721_bdma_ch_free(struct tsi721_bdma_chan *bdma_chan) struct tsi721_device *priv = to_tsi721(bdma_chan->dchan.device); #endif - if (bdma_chan->bd_base == NULL) + if (!bdma_chan->bd_base) return 0; /* Check if DMA channel still running */ @@ -346,7 +346,7 @@ tsi721_desc_fill_init(struct tsi721_tx_desc *desc, { u64 rio_addr; - if (bd_ptr == NULL) + if (!bd_ptr) return -EINVAL; /* Initialize DMA descriptor */ @@ -370,7 +370,7 @@ tsi721_desc_fill_init(struct tsi721_tx_desc *desc, static int tsi721_desc_fill_end(struct tsi721_dma_desc *bd_ptr, u32 bcount, bool interrupt) { - if (bd_ptr == NULL) + if (!bd_ptr) return -EINVAL; /* Update DMA descriptor */ @@ -555,9 +555,7 @@ static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan, * If there is no data transfer in progress, fetch new descriptor from * the pending queue. */ - - if (desc == NULL && bdma_chan->active_tx == NULL && - !list_empty(&bdma_chan->queue)) { + if (!desc && !bdma_chan->active_tx && !list_empty(&bdma_chan->queue)) { desc = list_first_entry(&bdma_chan->queue, struct tsi721_tx_desc, desc_node); list_del_init((&desc->desc_node)); @@ -735,7 +733,7 @@ static dma_cookie_t tsi721_tx_submit(struct dma_async_tx_descriptor *txd) static int tsi721_alloc_chan_resources(struct dma_chan *dchan) { struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan); - struct tsi721_tx_desc *desc = NULL; + struct tsi721_tx_desc *desc; int i; tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); @@ -754,9 +752,6 @@ static int tsi721_alloc_chan_resources(struct dma_chan *dchan) desc = kcalloc(dma_txqueue_sz, sizeof(struct tsi721_tx_desc), GFP_ATOMIC); if (!desc) { - tsi_err(&dchan->dev->device, - "DMAC%d Failed to allocate logical descriptors", - bdma_chan->id); tsi721_bdma_ch_free(bdma_chan); return -ENOMEM; } @@ -799,7 +794,7 @@ static void tsi721_free_chan_resources(struct dma_chan *dchan) tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); - if (bdma_chan->bd_base == NULL) + if (!bdma_chan->bd_base) return; tsi721_bdma_interrupt_enable(bdma_chan, 0); diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 38d949405618..83406696c7aa 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -81,6 +81,7 @@ u16 rio_local_get_device_id(struct rio_mport *port) return (RIO_GET_DID(port->sys_size, result)); } +EXPORT_SYMBOL_GPL(rio_local_get_device_id); /** * rio_query_mport - Query mport device attributes @@ -110,9 +111,8 @@ EXPORT_SYMBOL(rio_query_mport); */ struct rio_net *rio_alloc_net(struct rio_mport *mport) { - struct rio_net *net; + struct rio_net *net = kzalloc(sizeof(*net), GFP_KERNEL); - net = kzalloc(sizeof(struct rio_net), GFP_KERNEL); if (net) { INIT_LIST_HEAD(&net->node); INIT_LIST_HEAD(&net->devices); @@ -243,18 +243,17 @@ int rio_request_inb_mbox(struct rio_mport *mport, int rc = -ENOSYS; struct resource *res; - if (mport->ops->open_inb_mbox == NULL) + if (!mport->ops->open_inb_mbox) goto out; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - + res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { rio_init_mbox_res(res, mbox, mbox); /* Make sure this mailbox isn't in use */ - if ((rc = - request_resource(&mport->riores[RIO_INB_MBOX_RESOURCE], - res)) < 0) { + rc = request_resource(&mport->riores[RIO_INB_MBOX_RESOURCE], + res); + if (rc < 0) { kfree(res); goto out; } @@ -277,6 +276,7 @@ int rio_request_inb_mbox(struct rio_mport *mport, out: return rc; } +EXPORT_SYMBOL_GPL(rio_request_inb_mbox); /** * rio_release_inb_mbox - release inbound mailbox message service @@ -305,6 +305,7 @@ int rio_release_inb_mbox(struct rio_mport *mport, int mbox) return 0; } +EXPORT_SYMBOL_GPL(rio_release_inb_mbox); /** * rio_request_outb_mbox - request outbound mailbox service @@ -326,18 +327,17 @@ int rio_request_outb_mbox(struct rio_mport *mport, int rc = -ENOSYS; struct resource *res; - if (mport->ops->open_outb_mbox == NULL) + if (!mport->ops->open_outb_mbox) goto out; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - + res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { rio_init_mbox_res(res, mbox, mbox); /* Make sure this outbound mailbox isn't in use */ - if ((rc = - request_resource(&mport->riores[RIO_OUTB_MBOX_RESOURCE], - res)) < 0) { + rc = request_resource(&mport->riores[RIO_OUTB_MBOX_RESOURCE], + res); + if (rc < 0) { kfree(res); goto out; } @@ -360,6 +360,7 @@ int rio_request_outb_mbox(struct rio_mport *mport, out: return rc; } +EXPORT_SYMBOL_GPL(rio_request_outb_mbox); /** * rio_release_outb_mbox - release outbound mailbox message service @@ -388,6 +389,7 @@ int rio_release_outb_mbox(struct rio_mport *mport, int mbox) return 0; } +EXPORT_SYMBOL_GPL(rio_release_outb_mbox); /** * rio_setup_inb_dbell - bind inbound doorbell callback @@ -405,13 +407,10 @@ rio_setup_inb_dbell(struct rio_mport *mport, void *dev_id, struct resource *res, void (*dinb) (struct rio_mport * mport, void *dev_id, u16 src, u16 dst, u16 info)) { - int rc = 0; - struct rio_dbell *dbell; + struct rio_dbell *dbell = kmalloc(sizeof(*dbell), GFP_KERNEL); - if (!(dbell = kmalloc(sizeof(struct rio_dbell), GFP_KERNEL))) { - rc = -ENOMEM; - goto out; - } + if (!dbell) + return -ENOMEM; dbell->res = res; dbell->dinb = dinb; @@ -420,9 +419,7 @@ rio_setup_inb_dbell(struct rio_mport *mport, void *dev_id, struct resource *res, mutex_lock(&mport->lock); list_add_tail(&dbell->node, &mport->dbells); mutex_unlock(&mport->lock); - - out: - return rc; + return 0; } /** @@ -444,17 +441,16 @@ int rio_request_inb_dbell(struct rio_mport *mport, void (*dinb) (struct rio_mport * mport, void *dev_id, u16 src, u16 dst, u16 info)) { - int rc = 0; - - struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL); + int rc; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { rio_init_dbell_res(res, start, end); /* Make sure these doorbells aren't in use */ - if ((rc = - request_resource(&mport->riores[RIO_DOORBELL_RESOURCE], - res)) < 0) { + rc = request_resource(&mport->riores[RIO_DOORBELL_RESOURCE], + res); + if (rc < 0) { kfree(res); goto out; } @@ -467,6 +463,7 @@ int rio_request_inb_dbell(struct rio_mport *mport, out: return rc; } +EXPORT_SYMBOL_GPL(rio_request_inb_dbell); /** * rio_release_inb_dbell - release inbound doorbell message service @@ -508,6 +505,7 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end) out: return rc; } +EXPORT_SYMBOL_GPL(rio_release_inb_dbell); /** * rio_request_outb_dbell - request outbound doorbell message range @@ -536,6 +534,7 @@ struct resource *rio_request_outb_dbell(struct rio_dev *rdev, u16 start, return res; } +EXPORT_SYMBOL_GPL(rio_request_outb_dbell); /** * rio_release_outb_dbell - release outbound doorbell message range @@ -553,6 +552,7 @@ int rio_release_outb_dbell(struct rio_dev *rdev, struct resource *res) return rc; } +EXPORT_SYMBOL_GPL(rio_release_outb_dbell); /** * rio_add_mport_pw_handler - add port-write message handler into the list @@ -567,22 +567,17 @@ int rio_add_mport_pw_handler(struct rio_mport *mport, void *context, int (*pwcback)(struct rio_mport *mport, void *context, union rio_pw_msg *msg, int step)) { - int rc = 0; - struct rio_pwrite *pwrite; + struct rio_pwrite *pwrite = kzalloc(sizeof(*pwrite), GFP_KERNEL); - pwrite = kzalloc(sizeof(struct rio_pwrite), GFP_KERNEL); - if (!pwrite) { - rc = -ENOMEM; - goto out; - } + if (!pwrite) + return -ENOMEM; pwrite->pwcback = pwcback; pwrite->context = context; mutex_lock(&mport->lock); list_add_tail(&pwrite->node, &mport->pwrites); mutex_unlock(&mport->lock); -out: - return rc; + return 0; } EXPORT_SYMBOL_GPL(rio_add_mport_pw_handler); @@ -632,7 +627,7 @@ int rio_request_inb_pwrite(struct rio_dev *rdev, int rc = 0; spin_lock(&rio_global_list_lock); - if (rdev->pwcback != NULL) + if (rdev->pwcback) rc = -ENOMEM; else rdev->pwcback = pwcback; @@ -698,7 +693,7 @@ EXPORT_SYMBOL_GPL(rio_pw_enable); int rio_map_inb_region(struct rio_mport *mport, dma_addr_t local, u64 rbase, u32 size, u32 rflags) { - int rc = 0; + int rc; unsigned long flags; if (!mport->ops->map_inb) @@ -742,7 +737,7 @@ EXPORT_SYMBOL_GPL(rio_unmap_inb_region); int rio_map_outb_region(struct rio_mport *mport, u16 destid, u64 rbase, u32 size, u32 rflags, dma_addr_t *local) { - int rc = 0; + int rc; unsigned long flags; if (!mport->ops->map_outb) @@ -975,7 +970,7 @@ rio_chk_dev_route(struct rio_dev *rdev, struct rio_dev **nrdev, int *npnum) rdev = rdev->prev; } - if (prev == NULL) + if (!prev) goto err_out; p_port = prev->rswitch->route_table[rdev->destid]; @@ -1054,7 +1049,7 @@ rio_get_input_status(struct rio_dev *rdev, int pnum, u32 *lnkresp) RIO_MNT_REQ_CMD_IS); /* Exit if the response is not expected */ - if (lnkresp == NULL) + if (!lnkresp) return 0; checkcount = 3; @@ -1411,7 +1406,9 @@ rio_mport_get_feature(struct rio_mport * port, int local, u16 destid, ext_ftr_ptr, &ftr_header); if (RIO_GET_BLOCK_ID(ftr_header) == ftr) return ext_ftr_ptr; - if (!(ext_ftr_ptr = RIO_GET_BLOCK_PTR(ftr_header))) + + ext_ftr_ptr = RIO_GET_BLOCK_PTR(ftr_header); + if (!ext_ftr_ptr) break; } @@ -1462,6 +1459,7 @@ struct rio_dev *rio_get_asm(u16 vid, u16 did, spin_unlock(&rio_global_list_lock); return rdev; } +EXPORT_SYMBOL_GPL(rio_get_asm); /** * rio_get_device - Begin or continue searching for a RIO device by vid/did @@ -1481,6 +1479,7 @@ struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from) { return rio_get_asm(vid, did, RIO_ANY_ID, RIO_ANY_ID, from); } +EXPORT_SYMBOL_GPL(rio_get_device); /** * rio_std_route_add_entry - Add switch route table entry using standard @@ -1696,7 +1695,7 @@ int rio_route_add_entry(struct rio_dev *rdev, spin_lock(&rdev->rswitch->lock); - if (ops == NULL || ops->add_entry == NULL) { + if (!ops || !ops->add_entry) { rc = rio_std_route_add_entry(rdev->net->hport, rdev->destid, rdev->hopcount, table, route_destid, route_port); @@ -1749,7 +1748,7 @@ int rio_route_get_entry(struct rio_dev *rdev, u16 table, spin_lock(&rdev->rswitch->lock); - if (ops == NULL || ops->get_entry == NULL) { + if (!ops || !ops->get_entry) { rc = rio_std_route_get_entry(rdev->net->hport, rdev->destid, rdev->hopcount, table, route_destid, route_port); @@ -1797,7 +1796,7 @@ int rio_route_clr_table(struct rio_dev *rdev, u16 table, int lock) spin_lock(&rdev->rswitch->lock); - if (ops == NULL || ops->clr_table == NULL) { + if (!ops || !ops->clr_table) { rc = rio_std_route_clr_table(rdev->net->hport, rdev->destid, rdev->hopcount, table); } else if (try_module_get(ops->owner)) { @@ -1889,7 +1888,7 @@ struct dma_async_tx_descriptor *rio_dma_prep_xfer(struct dma_chan *dchan, { struct rio_dma_ext rio_ext; - if (dchan->device->device_prep_slave_sg == NULL) { + if (!dchan->device->device_prep_slave_sg) { pr_err("%s: prep_rio_sg == NULL\n", __func__); return NULL; } @@ -2189,7 +2188,6 @@ int rio_init_mports(void) work = kcalloc(n, sizeof *work, GFP_KERNEL); if (!work) { - pr_err("RIO: no memory for work struct\n"); destroy_workqueue(rio_wq); goto no_disc; } @@ -2216,6 +2214,7 @@ no_disc: return 0; } +EXPORT_SYMBOL_GPL(rio_init_mports); static int rio_get_hdid(int index) { @@ -2330,16 +2329,3 @@ int rio_unregister_mport(struct rio_mport *port) return 0; } EXPORT_SYMBOL_GPL(rio_unregister_mport); - -EXPORT_SYMBOL_GPL(rio_local_get_device_id); -EXPORT_SYMBOL_GPL(rio_get_device); -EXPORT_SYMBOL_GPL(rio_get_asm); -EXPORT_SYMBOL_GPL(rio_request_inb_dbell); -EXPORT_SYMBOL_GPL(rio_release_inb_dbell); -EXPORT_SYMBOL_GPL(rio_request_outb_dbell); -EXPORT_SYMBOL_GPL(rio_release_outb_dbell); -EXPORT_SYMBOL_GPL(rio_request_inb_mbox); -EXPORT_SYMBOL_GPL(rio_release_inb_mbox); -EXPORT_SYMBOL_GPL(rio_request_outb_mbox); -EXPORT_SYMBOL_GPL(rio_release_outb_mbox); -EXPORT_SYMBOL_GPL(rio_init_mports); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 83732fef510d..2f492dfcabde 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, + (void *)addr); + return(map_addr); } @@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED; + elf_type |= MAP_FIXED_NOREPLACE; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; @@ -939,7 +944,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -975,7 +980,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else load_bias = 0; @@ -1234,7 +1239,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) @@ -1599,6 +1604,8 @@ static int fill_files_note(struct memelfnote *note) /* *Estimated* file count and total data size needed */ count = current->mm->map_count; + if (count > UINT_MAX / 64) + return -EINVAL; size = count * 64; names_ofs = (2 + 3 * count) * sizeof(data[0]); @@ -44,6 +44,7 @@ /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; @@ -375,8 +376,8 @@ restart: * unmapped. */ if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_range(mapping, - (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); @@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, - PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } spin_lock_irq(&mapping->tree_lock); @@ -636,8 +635,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); unlock_pmd: - spin_unlock(ptl); #endif + spin_unlock(ptl); } else { if (pfn != pte_pfn(*ptep)) goto unlock_pte; @@ -1271,12 +1270,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { diff --git a/fs/dcache.c b/fs/dcache.c index 7678ba6b9f79..c99812fd54b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -289,7 +289,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry spin_unlock(&dentry->d_lock); name->name = p->name; } else { - memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } diff --git a/fs/exec.c b/fs/exec.c index 7eb8d21bcab9..e7b69e14649f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1333,6 +1333,7 @@ void setup_new_exec(struct linux_binprm * bprm) if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* * For secureexec, reset the stack limit to sane default to diff --git a/fs/fcntl.c b/fs/fcntl.c index c7b9e0948107..e95fa0a352ea 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_ADD_SEALS: case F_GET_SEALS: - err = shmem_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, arg); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index e8120a282435..15e06fb552da 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -444,7 +444,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + inode = hfsplus_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO); if (!inode) goto out; @@ -486,7 +486,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, mode); + inode = hfsplus_new_inode(dir->i_sb, dir, mode); if (!inode) goto out; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a015044daa05..d9255abafb81 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -478,7 +478,8 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode); void hfsplus_delete_inode(struct inode *inode); void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 190c60efbc99..c0c8d433864f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -354,7 +354,8 @@ static const struct file_operations hfsplus_file_operations = { .unlocked_ioctl = hfsplus_ioctl, }; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); @@ -364,9 +365,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = sbi->next_cnid++; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1d458b716957..513c357c734b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -549,7 +549,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); - sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR); if (!sbi->hidden_dir) { mutex_unlock(&sbi->vh_mutex); err = -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8a85f3f53446..8fe1b0aa2896 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -55,16 +55,6 @@ struct hugetlbfs_config { umode_t mode; }; -struct hugetlbfs_inode_info { - struct shared_policy policy; - struct inode vfs_inode; -}; - -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -{ - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); -} - int sysctl_hugetlb_shm_group; enum { @@ -520,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode_lock(inode); + + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + inode_unlock(inode); + return -EPERM; + } + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -539,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; @@ -570,6 +569,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. If NUMA is configured, use page index @@ -660,6 +664,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); BUG_ON(!inode); @@ -668,9 +673,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - if (attr->ia_size & ~huge_page_mask(h)) + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; + + if (newsize & ~huge_page_mask(h)) return -EINVAL; - error = hugetlb_vmtruncate(inode, attr->ia_size); + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + error = hugetlb_vmtruncate(inode, newsize); if (error) return error; } @@ -722,6 +734,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -729,6 +743,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; + info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6c5009cc4e6f..68cb9e4740b4 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -130,7 +130,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, - time_t ctime, __u64 cno) + time64_t ctime, __u64 cno) { int err; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 7bbccc099709..10e16935fff6 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -46,7 +46,7 @@ struct nilfs_segsum_info { unsigned long nfileblk; u64 seg_seq; __u64 cno; - time_t ctime; + time64_t ctime; sector_t next; }; @@ -120,7 +120,7 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time64_t, __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f3ffba41533..0953635e7d48 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2040,7 +2040,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto out; /* Update time stamp */ - sci->sc_seg_ctime = get_seconds(); + sci->sc_seg_ctime = ktime_get_real_seconds(); err = nilfs_segctor_collect(sci, nilfs, mode); if (unlikely(err)) diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 84084a4d9b3e..04634e3e3d58 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -157,7 +157,7 @@ struct nilfs_sc_info { unsigned long sc_blk_cnt; unsigned long sc_datablk_cnt; unsigned long sc_nblk_this_inc; - time_t sc_seg_ctime; + time64_t sc_seg_ctime; __u64 sc_cno; unsigned long sc_flags; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1341a41e7b43..c7fa139d50e8 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -526,7 +526,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) * @modtime: modification time (option) */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime) + unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 158a9190c8ec..673a891350f4 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -35,7 +35,7 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); int nilfs_sufile_alloc(struct inode *, __u64 *); int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime); + unsigned long nblocks, time64_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int, size_t); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3073b646e1ba..6ffeca84d7c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -283,10 +283,10 @@ int nilfs_commit_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; - time_t t; + time64_t t; /* nilfs->ns_sem must be locked by the caller. */ - t = get_seconds(); + t = ktime_get_real_seconds(); nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 490303e3d517..4b25837e7724 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -31,7 +31,7 @@ static struct kset *nilfs_kset; #define NILFS_SHOW_TIME(time_t_val, buf) ({ \ struct tm res; \ int count = 0; \ - time_to_tm(time_t_val, 0, &res); \ + time64_to_tm(time_t_val, 0, &res); \ res.tm_year += 1900; \ res.tm_mon += 1; \ count = scnprintf(buf, PAGE_SIZE, \ @@ -579,7 +579,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; @@ -593,13 +593,13 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", ctime); } static ssize_t @@ -607,7 +607,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; @@ -621,14 +621,13 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)nongc_ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime); } static ssize_t @@ -728,7 +727,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; @@ -742,13 +741,13 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); + return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime); } static ssize_t diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 883d732b0259..36da1779f976 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -116,7 +116,7 @@ struct the_nilfs { */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; - time_t ns_sbwtime; + time64_t ns_sbwtime; unsigned int ns_sbwcount; unsigned int ns_sbsize; unsigned int ns_mount_state; @@ -131,8 +131,8 @@ struct the_nilfs { __u64 ns_nextnum; unsigned long ns_pseg_offset; __u64 ns_cno; - time_t ns_ctime; - time_t ns_nongc_ctime; + time64_t ns_ctime; + time64_t ns_nongc_ctime; atomic_t ns_ndirtyblks; /* @@ -267,7 +267,7 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { - u64 t = get_seconds(); + u64 t = ktime_get_real_seconds(); return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 40b5cc97f7b0..917fadca8a7b 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) if (had_lock < 0) return ERR_PTR(had_lock); + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); @@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle, if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ab5105f9767e..a85de734eafa 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); + +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given); +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); + static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!obj) obj = (void *)bh->b_data; et->et_object = obj; + et->et_dealloc = NULL; et->et_ops->eo_fill_root_el(et); if (!et->et_ops->eo_fill_max_leaf_clusters) @@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { - int status, new_blocks, i; + int status, new_blocks, i, block_given = 0; u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; @@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle, goto bail; } - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, - meta_ac, new_eb_bhs); - if (status < 0) { - mlog_errno(status); - goto bail; + /* Firstyly, try to reuse dealloc since we have already estimated how + * many extent blocks we may use. + */ + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + new_eb_bhs, new_blocks, + &block_given); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + BUG_ON(block_given > new_blocks); + + if (block_given < new_blocks) { + BUG_ON(!meta_ac); + status = ocfs2_create_new_meta_bhs(handle, et, + new_blocks - block_given, + meta_ac, + &new_eb_bhs[block_given]); + if (status < 0) { + mlog_errno(status); + goto bail; + } } /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be @@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { - int status, i; + int status, i, block_given = 0; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, - &new_eb_bh); + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + &new_eb_bh, 1, + &block_given); + } else if (meta_ac) { + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + + } else { + BUG(); + } + if (status < 0) { mlog_errno(status); goto bail; @@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, int depth = le16_to_cpu(el->l_tree_depth); struct buffer_head *bh = NULL; - BUG_ON(meta_ac == NULL); + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { @@ -2598,11 +2636,8 @@ static void ocfs2_unlink_subtree(handle_t *handle, int i; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; - struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; - el = path_leaf_el(left_path); - eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) @@ -3938,7 +3973,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { - int ret, i, next_free; + int i, next_free; struct buffer_head *bh; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; @@ -3955,7 +3990,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - ret = -EIO; return; } @@ -5057,7 +5091,6 @@ int ocfs2_split_extent(handle_t *handle, struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; - struct ocfs2_extent_list *rightmost_el; if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < @@ -5093,9 +5126,7 @@ int ocfs2_split_extent(handle_t *handle, } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - rightmost_el = &eb->h_list; - } else - rightmost_el = path_root_el(path); + } if (rec->e_cpos == split_rec->e_cpos && rec->e_leaf_clusters == split_rec->e_leaf_clusters) @@ -6585,6 +6616,156 @@ ocfs2_find_per_slot_free_list(int type, return fl; } +static struct ocfs2_per_slot_free_list * +ocfs2_find_preferred_free_list(int type, + int preferred_slot, + int *real_slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { + *real_slot = fl->f_slot; + return fl; + } + + fl = fl->f_next_suballocator; + } + + /* If we can't find any free list matching preferred slot, just use + * the first one. + */ + fl = ctxt->c_first_suballocator; + *real_slot = fl->f_slot; + + return fl; +} + +/* Return Value 1 indicates empty */ +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) +{ + struct ocfs2_per_slot_free_list *fl = NULL; + + if (!et->et_dealloc) + return 1; + + fl = et->et_dealloc->c_first_suballocator; + if (!fl) + return 1; + + if (!fl->f_first) + return 1; + + return 0; +} + +/* If extent was deleted from tree due to extent rotation and merging, and + * no metadata is reserved ahead of time. Try to reuse some extents + * just deleted. This is only used to reuse extent blocks. + * It is supposed to find enough extent blocks in dealloc if our estimation + * on metadata is accurate. + */ +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given) +{ + int i, status = 0, real_slot; + struct ocfs2_cached_dealloc_ctxt *dealloc; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *bf; + struct ocfs2_extent_block *eb; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + *blk_given = 0; + + /* If extent tree doesn't have a dealloc, this is not faulty. Just + * tell upper caller dealloc can't provide any block and it should + * ask for alloc to claim more space. + */ + dealloc = et->et_dealloc; + if (!dealloc) + goto bail; + + for (i = 0; i < blk_wanted; i++) { + /* Prefer to use local slot */ + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num, &real_slot, + dealloc); + /* If no more block can be reused, we should claim more + * from alloc. Just return here normally. + */ + if (!fl) { + status = 0; + break; + } + + bf = fl->f_first; + fl->f_first = bf->free_next; + + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); + if (new_eb_bh[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + mlog(0, "Reusing block(%llu) from " + "dealloc(local slot:%d, real slot:%d)\n", + bf->free_blk, osb->slot_num, real_slot); + + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + new_eb_bh[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; + + /* We can't guarantee that buffer head is still cached, so + * polutlate the extent block again. + */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(bf->free_blk); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = cpu_to_le16(real_slot); + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. + */ + ocfs2_journal_dirty(handle, new_eb_bh[i]); + + if (!fl->f_first) { + dealloc->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + kfree(bf); + } + + *blk_given = i; + +bail: + if (unlikely(status)) { + for (; i >= 0; i--) { + if (new_eb_bh[i]) + brelse(new_eb_bh[i]); + } + } + + return status; +} + int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int type, int slot, u64 suballoc, u64 blkno, unsigned int bit) @@ -7382,6 +7563,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; + struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7419,6 +7601,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) trace_ocfs2_trim_fs(start, len, minlen); + ocfs2_trim_fs_lock_res_init(osb); + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == start && info.tf_len == len && + info.tf_minlen == minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out_trimunlock; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = start; + info.tf_len = len; + info.tf_minlen = minlen; + /* Determine first and last group to examine based on start and len */ first_group = ocfs2_which_cluster_group(main_bm_inode, start); if (first_group == osb->first_cluster_group_blkno) @@ -7463,6 +7681,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); } range->len = trimmed * sb->s_blocksize; + + info.tf_trimlen = range->len; + info.tf_success = (ret ? 0 : 1); + pinfo = &info; +out_trimunlock: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 27b75cf32cfa..250bcacdf9e9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -61,6 +61,7 @@ struct ocfs2_extent_tree { ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; + struct ocfs2_cached_dealloc_ctxt *et_dealloc; }; /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d1516327b787..e8e205bf2e41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt { struct ocfs2_cached_dealloc_ctxt w_dealloc; struct list_head w_unwritten_list; + unsigned int w_unwritten_count; }; void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) @@ -1386,6 +1387,7 @@ retry: desc->c_clear_unwritten = 0; list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); list_add_tail(&new->ue_node, &wc->w_unwritten_list); + wc->w_unwritten_count++; new = NULL; unlock: spin_unlock(&oi->ip_lock); @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, ue->ue_phys = desc->c_phys; list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); - dwc->dw_zero_count++; + dwc->dw_zero_count += wc->w_unwritten_count; } ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode, ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + /* Attach dealloc with extent tree in case that we may reuse extents + * which are already unlinked from current extent tree due to extent + * rotation and merging. + */ + et.et_dealloc = &dealloc; + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, &data_ac, &meta_ac); if (ret) { diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 62e8ec619b4c..af2e7473956e 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node) node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); - if (test_bit(node, qs->qs_hb_bm)) - o2quo_set_hold(qs, node); spin_unlock(&qs->qs_lock); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..0276f7f8d5e6 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -196,7 +196,7 @@ struct o2net_msg_handler { u32 nh_msg_type; u32 nh_key; o2net_msg_handler_func *nh_func; - o2net_msg_handler_func *nh_func_data; + void *nh_func_data; o2net_post_msg_handler_func *nh_post_func; struct kref nh_kref; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 32f9c72dff17..b7520e20a770 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1958,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9c3e0f13ca87..a7df226f9449 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1122,13 +1122,6 @@ recheck: /* sleep if we haven't finished voting yet */ if (sleep) { unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); - - /* - if (kref_read(&mle->mle_refs) < 2) - mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - kref_read(&mle->mle_refs), - res->lockname.len, res->lockname.name); - */ atomic_set(&mle->woken, 0); (void)wait_event_timeout(mle->wq, (atomic_read(&mle->woken) == 1), diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4689940a953c..e4b9085a657a 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; @@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, + &ocfs2_trim_fs_lops, osb); +} + +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_simple_drop_lockres(osb, lockres); + ocfs2_lock_res_free(lockres); +} + static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { @@ -1742,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write) return status; } +int ocfs2_try_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu try to take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); + return status; +} + void ocfs2_rw_unlock(struct inode *inode, int write) { int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; @@ -2486,6 +2529,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); + /* + * If we can't get inode lock immediately, we should not return + * directly here, since this will lead to a softlockup problem. + * The method is to get a blocking lock and immediately unlock + * before returning, this can avoid CPU resource waste due to + * lots of retries, and benefits fairness in getting lock. + */ + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } @@ -2494,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode, int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level) + int *level, int wait) { int ret; - ret = ocfs2_inode_lock(inode, NULL, 0); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, 0); + else + ret = ocfs2_try_inode_lock(inode, NULL, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } @@ -2512,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode, struct buffer_head *bh = NULL; ocfs2_inode_unlock(inode, 0); - ret = ocfs2_inode_lock(inode, &bh, 1); + if (wait) + ret = ocfs2_inode_lock(inode, &bh, 1); + else + ret = ocfs2_try_inode_lock(inode, &bh, 1); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } *level = 1; @@ -2745,6 +2807,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) ex ? LKM_EXMODE : LKM_PRMODE); } +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock) +{ + int status; + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (info) + info->tf_valid = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, + trylock ? DLM_LKF_NOQUEUE : 0, 0); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + return status; + } + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) { + info->tf_valid = 1; + info->tf_success = lvb->lvb_success; + info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum); + info->tf_start = be64_to_cpu(lvb->lvb_start); + info->tf_len = be64_to_cpu(lvb->lvb_len); + info->tf_minlen = be64_to_cpu(lvb->lvb_minlen); + info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen); + } + } + + return status; +} + +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info) +{ + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (ocfs2_mount_local(osb)) + return; + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION; + lvb->lvb_success = info->tf_success; + lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum); + lvb->lvb_start = cpu_to_be64(info->tf_start); + lvb->lvb_len = cpu_to_be64(info->tf_len); + lvb->lvb_minlen = cpu_to_be64(info->tf_minlen); + lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen); + } + + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; @@ -3413,7 +3539,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * we can recover correctly from node failure. Otherwise, we may get * invalid LVB in LKB, but without DLM_SBF_VALNOTVALIDÂ being set. */ - if (!ocfs2_is_o2cb_active() && + if (ocfs2_userspace_stack(osb) && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lvb = 1; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index a7fc18ba0dc1..256e0a9067b8 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +#define OCFS2_TRIMFS_LVB_VERSION 1 + +struct ocfs2_trim_fs_lvb { + __u8 lvb_version; + __u8 lvb_success; + __u8 lvb_reserved[2]; + __be32 lvb_nodenum; + __be64 lvb_start; + __be64 lvb_len; + __be64 lvb_minlen; + __be64 lvb_trimlen; +}; + +struct ocfs2_trim_fs_info { + u8 tf_valid; /* lvb is valid, or not */ + u8 tf_success; /* trim is successful, or not */ + u32 tf_nodenum; /* osb node number */ + u64 tf_start; /* trim start offset in clusters */ + u64 tf_len; /* trim end offset in clusters */ + u64 tf_minlen; /* trim minimum contiguous free clusters */ + u64 tf_trimlen; /* trimmed length in bytes */ +}; + struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; @@ -116,13 +139,14 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); int ocfs2_rw_lock(struct inode *inode, int write); +int ocfs2_try_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); int ocfs2_open_lock(struct inode *inode); int ocfs2_try_open_lock(struct inode *inode, int write); void ocfs2_open_unlock(struct inode *inode); int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level); + int *level, int wait); int ocfs2_inode_lock_full_nested(struct inode *inode, struct buffer_head **ret_bh, int ex, @@ -140,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, /* 99% of the time we don't want to supply any additional flags -- * those are for very specific cases only. */ #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +#define ocfs2_try_inode_lock(i, b, e)\ + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ + OI_LS_NORMAL) void ocfs2_inode_unlock(struct inode *inode, int ex); int ocfs2_super_lock(struct ocfs2_super *osb, @@ -153,6 +180,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb); +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb); +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock); +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e4719e0a3f99..06cb96462bf9 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -38,6 +38,7 @@ #include "inode.h" #include "super.h" #include "symlink.h" +#include "aops.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -832,6 +833,50 @@ out: return ret; } +/* Is IO overwriting allocated blocks? */ +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len) +{ + int ret = 0, is_last; + u32 mapping_end, cpos; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len)) + return ret; + else + return -EAGAIN; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + is_last = 0; + while (cpos < mapping_end && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + NULL, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) + break; + + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + break; + + cpos = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + } + + if (cpos < mapping_end) + ret = -EAGAIN; +out: + return ret; +} + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 67ea57d2fd59..1057586ec19f 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len); + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a1d051055472..5d1784a365a3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) spin_unlock(&oi->ip_lock); } + file->f_mode |= FMODE_NOWAIT; + leave: return status; } @@ -2132,12 +2134,12 @@ out: } static int ocfs2_prepare_inode_for_write(struct file *file, - loff_t pos, - size_t count) + loff_t pos, size_t count, int wait) { - int ret = 0, meta_level = 0; + int ret = 0, meta_level = 0, overwrite_io = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); + struct buffer_head *di_bh = NULL; loff_t end; /* @@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * if we need to make modifications here. */ for(;;) { - ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? NULL : &di_bh, meta_level); if (ret < 0) { meta_level = -1; - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } + /* + * Check if IO will overwrite allocated blocks in case + * IOCB_NOWAIT flag is set. + */ + if (!wait && !overwrite_io) { + overwrite_io = 1; + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = ocfs2_overwrite_io(inode, di_bh, pos, count); + brelse(di_bh); + di_bh = NULL; + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out_unlock; + } + } + /* Clear suid / sgid if necessary. We do this here * instead of later in the write path because * remove_suid() calls ->setattr without any hint that @@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, count); + pos, count, wait); + + brelse(di_bh); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2211,7 +2242,7 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, rw_level; + int rw_level; ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from); @@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, void *saved_ki_complete = NULL; int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ + if (!direct_io && nowait) + return -EOPNOTSUPP; + if (count == 0) return 0; - direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - - inode_lock(inode); + if (nowait) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else + inode_lock(inode); /* * Concurrent O_DIRECT writes are allowed with @@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, */ rw_level = (!direct_io || full_coherency || append_write); - ret = ocfs2_rw_lock(inode, rw_level); + if (nowait) + ret = ocfs2_try_rw_lock(inode, rw_level); + else + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_mutex; } @@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, * other nodes to drop their caches. Buffered I/O * already does this in write_begin(). */ - ret = ocfs2_inode_lock(inode, NULL, 1); + if (nowait) + ret = ocfs2_try_inode_lock(inode, NULL, 1); + else + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } count = ret; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, goto bail; } + if (!direct_io && nowait) + return -EOPNOTSUPP; + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = ocfs2_rw_lock(inode, 0); + if (direct_io) { + if (nowait) + ret = ocfs2_try_rw_lock(inode, 0); + else + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } rw_level = 0; @@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, + !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } ocfs2_inode_unlock(inode, lock_level); diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index e87279e49ba3..f65f2b2f594d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = { "UNSUPPORTED" }; -static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); -static LIST_HEAD(ocfs2_filecheck_sysfs_list); - -struct ocfs2_filecheck { - struct list_head fc_head; /* File check entry list head */ - spinlock_t fc_lock; - unsigned int fc_max; /* Maximum number of entry in list */ - unsigned int fc_size; /* Current entry count in list */ - unsigned int fc_done; /* Finished entry count in list */ -}; - -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - -#define OCFS2_FILECHECK_MAXSIZE 100 -#define OCFS2_FILECHECK_MINSIZE 10 - -/* File check operation type */ -enum { - OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ - OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ - OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ -}; - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; @@ -110,34 +80,84 @@ ocfs2_filecheck_error(int errno) return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf); -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count); -static struct kobj_attribute ocfs2_attr_filecheck_chk = +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = __ATTR(check, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_fix = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = __ATTR(fix, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_set = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = __ATTR(set, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; + +static void ocfs2_filecheck_release(struct kobject *kobj) +{ + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); +} + +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; +} + +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_attrs = ocfs2_filecheck_attrs, + .sysfs_ops = &ocfs2_filecheck_ops, + .release = ocfs2_filecheck_release, +}; static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, atomic_t_wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { p = list_first_entry(&entry->fs_fcheck->fc_head, @@ -148,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) } spin_unlock(&entry->fs_fcheck->fc_lock); - kset_unregister(entry->fs_fcheckkset); - kset_unregister(entry->fs_devicekset); kfree(entry->fs_fcheck); - kfree(entry); -} - -static void -ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) -{ - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); + entry->fs_fcheck = NULL; } -static int ocfs2_filecheck_sysfs_del(const char *devname) +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) { - struct ocfs2_filecheck_sysfs_entry *p; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - list_del(&p->fs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - ocfs2_filecheck_sysfs_free(p); - return 0; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return 1; -} - -static void -ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) -{ - if (atomic_dec_and_test(&entry->fs_count)) - wake_up_atomic_t(&entry->fs_count); -} - -static struct ocfs2_filecheck_sysfs_entry * -ocfs2_filecheck_sysfs_get(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p = NULL; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - atomic_inc(&p->fs_count); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return p; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return NULL; -} - -int ocfs2_filecheck_create_sysfs(struct super_block *sb) -{ - int ret = 0; - struct kset *device_kset = NULL; - struct kset *fcheck_kset = NULL; - struct ocfs2_filecheck *fcheck = NULL; - struct ocfs2_filecheck_sysfs_entry *entry = NULL; - struct attribute **attrs = NULL; - struct attribute_group attrgp; - - if (!ocfs2_kset) - return -ENOMEM; - - attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); - if (!attrs) { - ret = -ENOMEM; - goto error; - } else { - attrs[0] = &ocfs2_attr_filecheck_chk.attr; - attrs[1] = &ocfs2_attr_filecheck_fix.attr; - attrs[2] = &ocfs2_attr_filecheck_set.attr; - attrs[3] = NULL; - memset(&attrgp, 0, sizeof(attrgp)); - attrgp.attrs = attrs; - } + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); - if (!fcheck) { - ret = -ENOMEM; - goto error; - } else { - INIT_LIST_HEAD(&fcheck->fc_head); - spin_lock_init(&fcheck->fc_lock); - fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; - fcheck->fc_size = 0; - fcheck->fc_done = 0; - } - - if (strlen(sb->s_id) <= 0) { - mlog(ML_ERROR, - "Cannot get device basename when create filecheck sysfs\n"); - ret = -ENODEV; - goto error; - } - - device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); - if (!device_kset) { - ret = -ENOMEM; - goto error; - } + if (!fcheck) + return -ENOMEM; - fcheck_kset = kset_create_and_add("filecheck", NULL, - &device_kset->kobj); - if (!fcheck_kset) { - ret = -ENOMEM; - goto error; + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kfree(fcheck); + return ret; } - ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); - if (ret) - goto error; - - entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); - if (!entry) { - ret = -ENOMEM; - goto error; - } else { - atomic_set(&entry->fs_count, 1); - entry->fs_sb = sb; - entry->fs_devicekset = device_kset; - entry->fs_fcheckkset = fcheck_kset; - entry->fs_fcheck = fcheck; - ocfs2_filecheck_sysfs_add(entry); - } - - kfree(attrs); + entry->fs_fcheck = fcheck; return 0; - -error: - kfree(attrs); - kfree(entry); - kfree(fcheck); - kset_unregister(fcheck_kset); - kset_unregister(device_kset); - return ret; } -int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) { - return ocfs2_filecheck_sysfs_del(sb->s_id); + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); } static int @@ -309,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, spin_lock(&ent->fs_fcheck->fc_lock); if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { - mlog(ML_ERROR, + mlog(ML_NOTICE, "Cannot set online file check maximum entry number " "to %u due to too many pending entries(%u)\n", len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); @@ -386,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, return 0; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -394,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, ssize_t ret = 0, total = 0, remain = PAGE_SIZE; unsigned int type; struct ocfs2_filecheck_entry *p; - struct ocfs2_filecheck_sysfs_entry *ent; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) return -EINVAL; - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->name); - return -ENODEV; - } - if (type == OCFS2_FILECHECK_TYPE_SET) { spin_lock(&ent->fs_fcheck->fc_lock); total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); @@ -440,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, spin_unlock(&ent->fs_fcheck->fc_lock); exit: - ocfs2_filecheck_sysfs_put(ent); return total; } -static int +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + +static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { struct ocfs2_filecheck_entry *p; @@ -483,21 +408,21 @@ static void ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { - entry->fe_done = 1; spin_lock(&ent->fs_fcheck->fc_lock); + entry->fe_done = 1; ent->fs_fcheck->fc_done++; spin_unlock(&ent->fs_fcheck->fc_lock); } static unsigned int -ocfs2_filecheck_handle(struct super_block *sb, +ocfs2_filecheck_handle(struct ocfs2_super *osb, unsigned long ino, unsigned int flags) { unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; struct inode *inode = NULL; int rc; - inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + inode = ocfs2_iget(osb, ino, flags, 0); if (IS_ERR(inode)) { rc = (int)(-(long)inode); if (rc >= OCFS2_FILECHECK_ERR_START && @@ -515,11 +440,14 @@ static void ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); else entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; @@ -527,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, ocfs2_filecheck_done_entry(ent, entry); } -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + ssize_t ret = 0; struct ocfs2_filecheck_args args; struct ocfs2_filecheck_entry *entry; - struct ocfs2_filecheck_sysfs_entry *ent; - ssize_t ret = 0; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (count == 0) return count; - if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { - mlog(ML_ERROR, "Invalid arguments for online file check\n"); + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) return -EINVAL; - } - - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->parent->name); - return -ENODEV; - } if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); @@ -564,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, } spin_lock(&ent->fs_fcheck->fc_lock); - if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && - (ent->fs_fcheck->fc_done == 0)) { - mlog(ML_ERROR, + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", ent->fs_fcheck->fc_max); - ret = -EBUSY; + ret = -EAGAIN; kfree(entry); } else { if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && @@ -595,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - ocfs2_filecheck_sysfs_put(ent); return (!ret ? count : ret); } diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index e5cd002a2c09..6a22ee79e8d0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -43,7 +43,32 @@ enum { #define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED #define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED -int ocfs2_filecheck_create_sysfs(struct super_block *sb); -int ocfs2_filecheck_remove_sysfs(struct super_block *sb); +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); #endif /* FILECHECK_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 36304434eacf..d769ca29e32f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -666,21 +666,46 @@ static int __ocfs2_journal_access(handle_t *handle, /* we can safely remove this assertion after testing. */ if (!buffer_uptodate(bh)) { mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); - mlog(ML_ERROR, "b_blocknr=%llu\n", - (unsigned long long)bh->b_blocknr); + mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n", + (unsigned long long)bh->b_blocknr, bh->b_state); lock_buffer(bh); /* - * A previous attempt to write this buffer head failed. - * Nothing we can do but to retry the write and hope for - * the best. + * We should not reuse the dirty bh directly due to the + * following situation: + * + * 1. When removing extent rec, we will dirty the bhs of + * extent rec and truncate log at the same time, and + * hand them over to jbd2. + * 2. The bhs are not flushed to disk due to abnormal + * storage link. + * 3. After a while the storage link become normal. + * Truncate log flush worker triggered by the next + * space reclaiming found the dirty bh of truncate log + * and clear its 'BH_Write_EIO' and then set it uptodate + * in __ocfs2_journal_access(): + * + * ocfs2_truncate_log_worker + * ocfs2_flush_truncate_log + * __ocfs2_flush_truncate_log + * ocfs2_replay_truncate_records + * ocfs2_journal_access_di + * __ocfs2_journal_access + * + * 4. Then jbd2 will flush the bh of truncate log to disk, + * but the bh of extent rec is still in error state, and + * unfortunately nobody will take care of it. + * 5. At last the space of extent rec was not reduced, + * but truncate log flush worker have given it back to + * globalalloc. That will cause duplicate cluster problem + * which could be identified by fsck.ocfs2. + * + * So we should return -EIO in case of ruining atomicity + * and consistency of space reclaim. */ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "A previous attempt to write this " + "buffer head failed\n"); unlock_buffer(bh); return -EIO; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 098f5c712569..fb9a20e3d608 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), - file->f_path.mnt, &lock_level); + file->f_path.mnt, &lock_level, 1); if (ret < 0) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9a50f222ac97..4f86ac0027b5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -50,6 +50,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -404,6 +406,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_lock_res osb_trim_fs_lockres; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; @@ -471,6 +474,12 @@ struct ocfs2_super * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index d277aabf5dfb..7051b994c776 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -50,6 +50,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; @@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; + case OCFS2_LOCK_TYPE_TRIM_FS: + c = 'I'; + break; default: c = '\0'; } @@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", + [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a0b5d00ef0a9..e2a11aaece10 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - unsigned long count), - TP_ARGS(ino, saved_pos, count), + unsigned long count, int wait), + TP_ARGS(ino, saved_pos, count, wait), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) __field(unsigned long, count) + __field(int, wait) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; __entry->count = count; + __entry->wait = wait; ), - TP_printk("%llu %llu %lu", __entry->ino, - __entry->saved_pos, __entry->count) + TP_printk("%llu %llu %lu %d", __entry->ino, + __entry->saved_pos, __entry->count, __entry->wait) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d6c350ba25b9..c4b029c43464 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; -inline int ocfs2_is_o2cb_active(void) -{ - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); -} -EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); - static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e3036e1790e8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ -int ocfs2_is_o2cb_active(void); - extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 9f0b95abc09f..d8f5f6ce99dc 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + if (undo_fn) + jbd_unlock_bh_state(group_bh); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2563,16 +2565,16 @@ static int _ocfs2_free_clusters(handle_t *handle, int status; u16 bg_start_bit; u64 bg_blkno; - struct ocfs2_dinode *fe; /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry * about looping on them. * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ - BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, + ocfs2_blocks_to_clusters(bitmap_inode->i_sb, + start_blk))); - fe = (struct ocfs2_dinode *) bitmap_bh->b_data; ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80efa5699fb0..342b240aa255 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); - /* FIXME: Should ERROR_RO_FS */ mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; @@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; @@ -1162,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto read_super_error; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2/<devname>/filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto read_super_error; + } + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else @@ -1200,22 +1216,20 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ - ocfs2_filecheck_create_sysfs(sb); - return status; read_super_error: brelse(bh); + if (status) + mlog_errno(status); + if (osb) { atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_dismount_volume(sb, 1); } - if (status) - mlog_errno(status); return status; } @@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1843,6 +1856,9 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); + if (status == -EBADR && ocfs2_userspace_stack(osb)) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); goto leave; } @@ -1896,6 +1912,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + /* Remove file check sysfs related directores/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); + + kset_unregister(osb->osb_dev_kset); + debugfs_remove(osb->osb_ctxt); /* Orphan scan should be stopped as early as possible */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c5898c59d411..c261c1dfd374 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -638,14 +638,17 @@ int ocfs2_calc_xattr_init(struct inode *dir, si->value_len); if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, "", NULL, 0); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (acl_len > 0) { a_size = ocfs2_xattr_entry_real_size(0, acl_len); if (S_ISDIR(mode)) a_size <<= 1; } else if (acl_len != 0 && acl_len != -ENODATA) { + ret = acl_len; mlog_errno(ret); return ret; } @@ -6415,7 +6418,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, * and then insert the extents one by one. */ if (xv->xr_list.l_tree_depth) { - memcpy(new_xv, &def_xv, sizeof(def_xv)); + memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE); vb->vb_xv = new_xv; vb->vb_bh = value_bh; ocfs2_init_xattr_value_extent_tree(&data_et, diff --git a/fs/proc/array.c b/fs/proc/array.c index d67a72dcb92c..598803576e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -736,16 +736,10 @@ static int children_seq_open(struct inode *inode, struct file *file) return ret; } -int children_seq_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - return 0; -} - const struct file_operations proc_tid_children_operations = { .open = children_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = children_seq_release, + .release = seq_release, }; #endif /* CONFIG_PROC_CHILDREN */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 60316b52d659..9298324325ed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -75,6 +75,7 @@ #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/printk.h> +#include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -100,6 +101,8 @@ #include "internal.h" #include "fd.h" +#include "../../lib/kstrtox.h" + /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during @@ -110,8 +113,8 @@ * in /proc for a task before it execs a suid executable. */ -static u8 nlink_tid; -static u8 nlink_tgid; +static u8 nlink_tid __ro_after_init; +static u8 nlink_tgid __ro_after_init; struct pid_entry { const char *name; @@ -1370,7 +1373,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - WRITE_ONCE(task->fail_nth, n); + task->fail_nth = n; put_task_struct(task); return count; @@ -1386,8 +1389,7 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - len = snprintf(numbuf, sizeof(numbuf), "%u\n", - READ_ONCE(task->fail_nth)); + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); len = simple_read_from_buffer(buf, count, ppos, numbuf, len); put_task_struct(task); @@ -1907,8 +1909,33 @@ end_instantiate: static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) return -EINVAL; + str += len; + + if (*str != '-') + return -EINVAL; + str++; + + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; return 0; } @@ -2000,9 +2027,9 @@ out: } struct map_files_info { + unsigned long start; + unsigned long end; fmode_t mode; - unsigned int len; - unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; /* @@ -2172,10 +2199,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (++pos <= ctx->pos) continue; + info.start = vma->vm_start; + info.end = vma->vm_end; info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); if (flex_array_put(fa, i++, &info, GFP_KERNEL)) BUG(); } @@ -2183,9 +2209,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) up_read(&mm->mmap_sem); for (i = 0; i < nr_files; i++) { + char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ + unsigned int len; + p = flex_array_get(fa, i); + len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, - p->name, p->len, + buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) @@ -3018,11 +3048,11 @@ static const struct inode_operations proc_tgid_base_inode_operations = { static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) { struct dentry *dentry, *leader, *dir; - char buf[PROC_NUMBUF]; + char buf[10 + 1]; struct qstr name; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { @@ -3034,7 +3064,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) return; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", tgid); + name.len = snprintf(buf, sizeof(buf), "%u", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); if (!leader) goto out; @@ -3046,7 +3076,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) goto out_put_leader; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { d_invalidate(dentry); @@ -3225,14 +3255,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) continue; - len = snprintf(name, sizeof(name), "%d", iter.tgid); + len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { @@ -3560,10 +3590,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; tid = task_pid_nr_ns(task, ns); - len = snprintf(name, sizeof(name), "%d", tid); + len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 290ba85cb900..a8ac48aebd59 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -55,8 +55,7 @@ static int show_console_dev(struct seq_file *m, void *v) if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); - seq_printf(m, "\n"); - + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 96fc70225e54..6b80cd1e419a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -236,7 +236,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, for (fd = ctx->pos - 2; fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; if (!fcheck_files(files, fd)) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 793a67574668..5d709fa8f3a2 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -28,7 +28,7 @@ static DEFINE_RWLOCK(proc_subdir_lock); -static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; @@ -60,7 +60,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); - int result = proc_match(len, name, de); + int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; @@ -84,7 +84,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); - int result = proc_match(de->namelen, de->name, this); + int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) @@ -211,8 +211,8 @@ void proc_free_inum(unsigned int inum) * Don't create negative dentries here, return -ENOENT by hand * instead. */ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, + struct proc_dir_entry *de) { struct inode *inode; @@ -235,7 +235,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - return proc_lookup_de(PDE(dir), dir, dentry); + return proc_lookup_de(dir, dentry, PDE(dir)); } /* @@ -247,8 +247,8 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *file, - struct dir_context *ctx) +int proc_readdir_de(struct file *file, struct dir_context *ctx, + struct proc_dir_entry *de) { int i; @@ -292,7 +292,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), file, ctx); + return proc_readdir_de(file, ctx, PDE(inode)); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8dacaabb9f37..6e8724958116 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> @@ -52,7 +53,7 @@ static void proc_evict_inode(struct inode *inode) } } -static struct kmem_cache * proc_inode_cachep; +static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -128,12 +129,12 @@ enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { - return atomic_inc_unless_negative(&pde->in_use); + return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { - if (atomic_dec_return(&pde->in_use) == BIAS) + if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } @@ -166,7 +167,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (pdeo->c) + if (unlikely(pdeo->c)) complete(pdeo->c); kfree(pdeo); } @@ -420,7 +421,7 @@ static const char *proc_get_link(struct dentry *dentry, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); - if (unlikely(!use_pde(pde))) + if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4a67188c8d74..d697c8ab0a14 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -31,24 +31,28 @@ struct mempolicy; * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + atomic_t count; /* use count */ + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + void *data; unsigned int low_ino; - umode_t mode; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; - const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; struct proc_dir_entry *parent; struct rb_root_cached subdir; struct rb_node subdir_node; - void *data; - atomic_t count; /* use count */ - atomic_t in_use; /* number of callers into module in progress; */ - /* negative -> it's going away RSN */ - struct completion *pde_unload_completion; - struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + umode_t mode; u8 namelen; char name[]; } __randomize_layout; @@ -149,10 +153,9 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i * generic.c */ extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); -extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, - struct dentry *); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); -extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4bc85cb8be6a..e8a93bc8285d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,23 +512,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return -EFAULT; } else { if (kern_addr_valid(start)) { - unsigned long n; - /* * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - memcpy(buf, (char *) start, tsz); - n = copy_to_user(buffer, buf, tsz); - /* - * We cannot distinguish between fault on source - * and fault on destination. When this happens - * we clear too and hope it will trigger the - * EFAULT again. - */ - if (n) { - if (clear_user(buffer + tsz - n, - n)) + if (probe_kernel_read(buf, (void *) start, tsz)) { + if (clear_user(buffer, tsz)) + return -EFAULT; + } else { + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; } } else { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a2bf369c923d..68c06ae7888c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -135,7 +135,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; @@ -172,7 +172,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, file, ctx); + ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; diff --git a/fs/proc/self.c b/fs/proc/self.c index 31326bb23b8b..4d7d061696b3 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -17,11 +18,11 @@ static const char *proc_self_get_link(struct dentry *dentry, if (!tgid) return ERR_PTR(-ENOENT); - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + /* max length of unsigned int in decimal + NULL term */ + name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d", tgid); + sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } @@ -30,7 +31,7 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum; +static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 339e4c1c044d..b66fc8de7d34 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -47,8 +47,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; - lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + /* split executable areas between text and lib */ + text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); + text = min(text, mm->exec_vm << PAGE_SHIFT); + lib = (mm->exec_vm << PAGE_SHIFT) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" @@ -76,7 +79,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), text, lib, + mm->stack_vm << (PAGE_SHIFT-10), + text >> 10, + lib >> 10, mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); @@ -282,15 +287,18 @@ static void show_vma_header_prefix(struct seq_file *m, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void @@ -977,14 +985,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) + if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b813e3b529f2..9d2efaca499f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -18,11 +19,10 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, - dentry ? GFP_KERNEL : GFP_ATOMIC); + name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d/task/%d", tgid, pid); + sprintf(name, "%u/task/%u", tgid, pid); set_delayed_call(done, kfree_link, name); return name; } @@ -31,7 +31,7 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum; +static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 885d445afa0d..a45f0af22a60 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1178,18 +1178,16 @@ fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) { - struct list_head *pos, *next; - if (proc_vmcore) { proc_remove(proc_vmcore); proc_vmcore = NULL; } /* clear the vmcore list. */ - list_for_each_safe(pos, next, &vmcore_list) { + while (!list_empty(&vmcore_list)) { struct vmcore *m; - m = list_entry(pos, struct vmcore, list); + m = list_first_entry(&vmcore_list, struct vmcore, list); list_del(&m->list); kfree(m); } diff --git a/fs/seq_file.c b/fs/seq_file.c index eea09f6d8830..f36ab451ade2 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -698,11 +698,6 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, if (m->count + 1 >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; @@ -715,6 +710,51 @@ overflow: } EXPORT_SYMBOL(seq_put_decimal_ull); +/** + * seq_put_hex_ll - put a number in hexadecimal notation + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @v: the number + * @width: a minimum field width + * + * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "0x08llx", v) + * + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, int width) +{ + int i, len; + + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } + + /* If x is 0, the result of __builtin_clzll is undefined */ + if (v == 0) + len = 1; + else + len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; + + if (len < width) + len = width; + + if (m->count + len > m->size) { + seq_set_overflow(m); + return; + } + + for (i = len - 1; i >= 0; i--) { + m->buf[m->count + i] = hex_asc[0xf & v]; + v = v >> 4; + } + m->count += len; +} + void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; @@ -737,11 +777,6 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num num = -num; } - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; @@ -782,8 +817,14 @@ EXPORT_SYMBOL(seq_write); void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; - if (size > 0) - seq_printf(m, "%*s", size, ""); + if (size > 0) { + if (size + m->count > m->size) { + seq_set_overflow(m); + return; + } + memset(m->buf + m->count, ' ', size); + m->count += size; + } if (c) seq_putc(m, c); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 743eaa646898..87a13a7c8270 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * pmd_trans_unstable) of the pmd. */ _pmd = READ_ONCE(*pmd); - if (!pmd_present(_pmd)) + if (pmd_none(_pmd)) goto out; ret = false; + if (!pmd_present(_pmd)) + goto out; + if (pmd_trans_huge(_pmd)) goto out; @@ -985,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, struct uffd_msg *msg) { int fd; - struct file *file; - unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; - fd = get_unused_fd_flags(flags); + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); if (fd < 0) return fd; - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | flags); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - - fd_install(fd, file); msg->arg.reserved.reserved1 = 0; msg->arg.fork.ufd = fd; - return 0; } @@ -1884,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_init(&ctx->refile_seq); } -/** - * userfaultfd_file_create - Creates a userfaultfd file pointer. - * @flags: Flags for the userfaultfd file. - * - * This function creates a userfaultfd file pointer, w/out installing - * it into the fd table. This is useful when the userfaultfd file is - * used during the initialization of data structures that require - * extra setup after the userfaultfd creation. So the userfaultfd - * creation is split into the file pointer creation phase, and the - * file descriptor installation phase. In this way races with - * userspace closing the newly installed file descriptor can be - * avoided. Returns a userfaultfd file pointer, or a proper error - * pointer. - */ -static struct file *userfaultfd_file_create(int flags) +SYSCALL_DEFINE1(userfaultfd, int, flags) { - struct file *file; struct userfaultfd_ctx *ctx; + int fd; BUG_ON(!current->mm); @@ -1909,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags) BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); - file = ERR_PTR(-EINVAL); if (flags & ~UFFD_SHARED_FCNTL_FLAGS) - goto out; + return -EINVAL; - file = ERR_PTR(-ENOMEM); ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) - goto out; + return -ENOMEM; atomic_set(&ctx->refcount, 1); ctx->flags = flags; @@ -1927,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); - if (IS_ERR(file)) { + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } -out: - return file; -} - -SYSCALL_DEFINE1(userfaultfd, int, flags) -{ - int fd, error; - struct file *file; - - error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (error < 0) - return error; - fd = error; - - file = userfaultfd_file_create(flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - - return error; } static int __init userfaultfd_init(void) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 1ba611e16fa0..8a1ee10014de 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -16,6 +16,22 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset); #endif +#ifndef find_next_and_bit +/** + * find_next_and_bit - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +extern unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset); +#endif + #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region @@ -55,8 +71,12 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ +#ifndef find_first_bit #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) +#endif +#ifndef find_first_zero_bit #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +#endif #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 963b755d19b0..a7613e1b0c87 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -52,6 +52,7 @@ struct bug_entry { #ifndef HAVE_ARCH_BUG #define BUG() do { \ printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \ + barrier_before_unreachable(); \ panic("BUG!"); \ } while (0) #endif diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 868e68561f91..2cfa3075d148 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -309,19 +309,26 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif -#ifndef __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp); -#endif - -#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is an implementation of pmdp_establish() that is only suitable for an + * architecture that doesn't have hardware dirty/accessed bits. In this case we + * can't race with CPU which sets these bits and non-atomic aproach is fine. + */ +static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) { - + pmd_t old_pmd = *pmdp; + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + return old_pmd; } #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3489253e38fc..49aea0b37994 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -64,9 +64,14 @@ * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region - * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words) - * bitmap_to_u32array(buf, nwords, src, nbits) *buf = *dst (nwords 32b words) + * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst + * bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst * + * Note, bitmap_zero() and bitmap_fill() operate over the region of + * unsigned longs, that is, bits behind bitmap till the unsigned long + * boundary will be zeroed or filled as well. Consider to use + * bitmap_clear() or bitmap_set() to make explicit zeroing or filling + * respectively. */ /** @@ -83,8 +88,12 @@ * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr - * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit + * find_next_zero_bit(addr, nbits, bit) + * Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit + * find_next_and_bit(addr1, addr2, nbits, bit) + * Same as find_next_bit, but in + * (*addr1 & *addr2) * */ @@ -174,14 +183,7 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); -extern unsigned int bitmap_from_u32array(unsigned long *bitmap, - unsigned int nbits, - const u32 *buf, - unsigned int nwords); -extern unsigned int bitmap_to_u32array(u32 *buf, - unsigned int nwords, - const unsigned long *bitmap, - unsigned int nbits); + #ifdef __BIG_ENDIAN extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); #else @@ -209,12 +211,12 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { - unsigned int nlongs = BITS_TO_LONGS(nbits); - if (!small_const_nbits(nbits)) { - unsigned int len = (nlongs - 1) * sizeof(unsigned long); - memset(dst, 0xff, len); + if (small_const_nbits(nbits)) + *dst = ~0UL; + else { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + memset(dst, 0xff, len); } - dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, @@ -228,6 +230,35 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } } +/* + * Copy bitmap and clear tail bits in last word. + */ +static inline void bitmap_copy_safe(unsigned long *dst, + const unsigned long *src, unsigned int nbits) +{ + bitmap_copy(dst, src, nbits); + if (nbits % BITS_PER_LONG) + dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); +} + +/* + * On 32-bit systems bitmaps are represented as u32 arrays internally, and + * therefore conversion is not needed when copying data from/to arrays of u32. + */ +#if BITS_PER_LONG == 64 +extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, + unsigned int nbits); +extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, + unsigned int nbits); +#else +#define bitmap_from_arr32(bitmap, buf, nbits) \ + bitmap_copy_safe((unsigned long *) (bitmap), \ + (const unsigned long *) (buf), (nbits)) +#define bitmap_to_arr32(buf, bitmap, nbits) \ + bitmap_copy_safe((unsigned long *) (buf), \ + (const unsigned long *) (bitmap), (nbits)) +#endif + static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index 3efed0d742a0..43d1fd50d433 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -8,7 +8,6 @@ #define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_ZERO(e) (0) -#define BUILD_BUG_ON_NULL(e) ((void *)0) #define BUILD_BUG_ON_INVALID(e) (0) #define BUILD_BUG_ON_MSG(cond, msg) (0) #define BUILD_BUG_ON(condition) (0) @@ -28,7 +27,6 @@ * aren't permitted). */ #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); })) -#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); })) /* * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9f242b876fde..8e12d7c54308 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -81,6 +81,11 @@ enum { * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), + + /* + * Enable cgroup-aware OOM killer. + */ + CGRP_GROUP_OOM = (1 << 5), }; /* cftype->flags */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 3b609edffa8f..d02a4df3f473 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -19,3 +19,11 @@ #define randomized_struct_fields_start struct { #define randomized_struct_fields_end }; + +/* all clang versions usable with the kernel support KASAN ABI version 5 */ +#define KASAN_ABI_VERSION 5 + +/* emulate gcc's __SANITIZE_ADDRESS__ flag */ +#if __has_feature(address_sanitizer) +#define __SANITIZE_ADDRESS__ +#endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 631354acfa72..12a695ffdfb5 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -205,6 +205,15 @@ #endif /* + * calling noreturn functions, __builtin_unreachable() and __builtin_trap() + * confuse the stack allocation in gcc, leading to overly large stack + * frames, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 + * + * Adding an empty inline assembly before it works around the problem + */ +#define barrier_before_unreachable() asm volatile("") + +/* * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer * control elsewhere. @@ -214,7 +223,11 @@ * unreleased. Really, we need to have autoconf for the kernel. */ #define unreachable() \ - do { annotate_unreachable(); __builtin_unreachable(); } while (0) + do { \ + annotate_unreachable(); \ + barrier_before_unreachable(); \ + __builtin_unreachable(); \ + } while (0) /* Mark a function definition as prohibited from being cloned. */ #define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 52e611ab9a6c..97847f2f86cf 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -86,6 +86,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define barrier_data(ptr) barrier() #endif +/* workaround for GCC PR82365 if needed */ +#ifndef barrier_before_unreachable +# define barrier_before_unreachable() do { } while (0) +#endif + /* Unreachable code */ #ifdef CONFIG_STACK_VALIDATION /* diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 75b565194437..d4a2a7dcd72d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -640,7 +640,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes */ -static inline size_t cpumask_size(void) +static inline unsigned int cpumask_size(void) { return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); } diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index b511f6d24b42..e2013dd4d5a4 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -59,6 +59,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern u32 *vmcoreinfo_note; diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 604967609e55..83f81ac53282 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -2,6 +2,7 @@ #ifndef GENL_MAGIC_FUNC_H #define GENL_MAGIC_FUNC_H +#include <linux/build_bug.h> #include <linux/genl_magic_struct.h> /* @@ -132,17 +133,6 @@ static void dprint_array(const char *dir, int nla_type, * use one static buffer for parsing of nested attributes */ static struct nlattr *nested_attr_tb[128]; -#ifndef BUILD_BUG_ON -/* Force a compilation error if condition is true */ -#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) -/* Force a compilation error if condition is true, but also produce a - result (of value 0 and type size_t), so the expression can be used - e.g. in a structure initializer (or where-ever else comma expressions - aren't permitted). */ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) -#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) -#endif - #undef GENL_struct #define GENL_struct(tag_name, tag_number, s_name, s_fields) \ /* *_from_attrs functions are static, but potentially unused */ \ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 82a25880714a..36fa6a2a82e3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -129,7 +130,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; @@ -158,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -198,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) +#define move_hugetlb_state(old, new, reason) do {} while (0) static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) @@ -271,6 +273,17 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) return sb->s_fs_info; } +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; + unsigned int seals; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, @@ -343,10 +356,10 @@ struct huge_bootmem_page { struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -524,7 +537,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_noerr(v, a, r) NULL +#define alloc_huge_page_vma(h, vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index d79d1e7486bd..657a83b943f0 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -167,7 +167,7 @@ static inline int kallsyms_show_value(void) static inline void print_ip_sym(unsigned long ip) { - printk("[<%p>] %pS\n", (void *) ip, (void *) ip); + printk("[<%px>] %pS\n", (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e3eb834c9a35..adc13474a53b 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -11,8 +11,6 @@ struct task_struct; #ifdef CONFIG_KASAN -#define KASAN_SHADOW_SCALE_SHIFT 3 - #include <asm/kasan.h> #include <asm/pgtable.h> @@ -56,14 +54,14 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object); void kasan_init_slab_obj(struct kmem_cache *cache, const void *object); void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); -void kasan_kfree_large(const void *ptr); -void kasan_poison_kfree(void *ptr); +void kasan_kfree_large(void *ptr, unsigned long ip); +void kasan_poison_kfree(void *ptr, unsigned long ip); void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags); void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); -bool kasan_slab_free(struct kmem_cache *s, void *object); +bool kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); struct kasan_cache { int alloc_meta_offset; @@ -108,8 +106,8 @@ static inline void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) {} static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {} -static inline void kasan_kfree_large(const void *ptr) {} -static inline void kasan_poison_kfree(void *ptr) {} +static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} +static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) {} static inline void kasan_krealloc(const void *object, size_t new_size, @@ -117,7 +115,8 @@ static inline void kasan_krealloc(const void *object, size_t new_size, static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) {} -static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip) { return false; } diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index bb8129a3474d..96def9d15b1b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -32,6 +32,7 @@ struct list_lru_one { }; struct list_lru_memcg { + struct rcu_head rcu; /* array of per cgroup lists, indexed by memcg_cache_id */ struct list_lru_one *lru[0]; }; @@ -43,7 +44,7 @@ struct list_lru_node { struct list_lru_one lru; #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ - struct list_lru_memcg *memcg_lrus; + struct list_lru_memcg __rcu *memcg_lrus; #endif long nr_items; } ____cacheline_aligned_in_smp; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..7b8bcdf6571d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,6 +35,7 @@ struct mem_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct oom_control; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -108,7 +109,10 @@ struct lruvec_stat { */ struct mem_cgroup_per_node { struct lruvec lruvec; - struct lruvec_stat __percpu *lruvec_stat; + + struct lruvec_stat __percpu *lruvec_stat_cpu; + atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -199,6 +203,13 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + /* + * Treat the sub-tree as an indivisible memory consumer, + * kill all belonging tasks if the memory cgroup selected + * as OOM victim. + */ + bool oom_group; + /* handle for "memory.events" */ struct cgroup_file events_file; @@ -227,10 +238,10 @@ struct mem_cgroup { spinlock_t move_lock; struct task_struct *move_lock_task; unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; + + struct mem_cgroup_stat_cpu __percpu *stat_cpu; + atomic_long_t stat[MEMCG_NR_STAT]; + atomic_long_t events[MEMCG_NR_EVENTS]; unsigned long socket_pressure; @@ -265,6 +276,12 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; +/* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define MEMCG_CHARGE_BATCH 32U + extern struct mem_cgroup *root_mem_cgroup; static inline bool mem_cgroup_disabled(void) @@ -272,13 +289,6 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) -{ - this_cpu_inc(memcg->stat->events[event]); - cgroup_file_notify(&memcg->events_file); -} - bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, @@ -342,6 +352,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -480,6 +495,13 @@ static inline bool task_in_memcg_oom(struct task_struct *p) bool mem_cgroup_oom_synchronize(bool wait); +bool mem_cgroup_select_oom_victim(struct oom_control *oc); + +static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg) +{ + return memcg->oom_group; +} + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -492,32 +514,38 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + long x = atomic_long_read(&memcg->stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->count[idx], val); + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->stat[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->count[idx], x); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); + preempt_disable(); + __mod_memcg_state(memcg, idx, val); + preempt_enable(); } /** @@ -555,87 +583,108 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long val = 0; - int cpu; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - val += per_cpu(pn->lruvec_stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; + long x; + /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + + /* Update memcg */ __mod_memcg_state(pn->memcg, idx, val); - __this_cpu_add(pn->lruvec_stat->count[idx], val); + + /* Update lruvec */ + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &pn->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - if (mem_cgroup_disabled()) - return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - mod_memcg_state(pn->memcg, idx, val); - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_state(lruvec, idx, val); + preempt_enable(); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; - __mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!page->mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); return; - __mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - __this_cpu_add(pn->lruvec_stat->count[idx], val); + } + + lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + __mod_lruvec_state(lruvec, idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) - return; - mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_page_state(page, idx, val); + preempt_enable(); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +/* idx can be of type enum memcg_event_item or vm_event_item */ +static inline void __count_memcg_events(struct mem_cgroup *memcg, + int idx, unsigned long count) +{ + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->events[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->events[idx], x); +} + static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) + int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->events[idx], count); + preempt_disable(); + __count_memcg_events(memcg, idx, count); + preempt_enable(); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, int idx) { @@ -654,12 +703,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) { - this_cpu_inc(memcg->stat->events[idx]); + count_memcg_events(memcg, idx, 1); if (idx == OOM_KILL) cgroup_file_notify(&memcg->events_file); } rcu_read_unlock(); } + +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) +{ + count_memcg_events(memcg, event, 1); + cgroup_file_notify(&memcg->events_file); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); #endif @@ -744,6 +801,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline void mem_cgroup_put(struct mem_cgroup *memcg) +{ +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -936,6 +997,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + return false; +} + +static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index aba5f86eb038..787411bdeadb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -234,9 +234,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a2246cf670ba..0c6fe904bc97 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,7 @@ #include <linux/migrate_mode.h> #include <linux/hugetlb.h> -typedef struct page *new_page_t(struct page *page, unsigned long private, - int **reason); +typedef struct page *new_page_t(struct page *page, unsigned long private); typedef void free_page_t(struct page *page, unsigned long private); /* @@ -43,9 +42,9 @@ static inline struct page *new_page_nodemask(struct page *page, return alloc_huge_page_nodemask(page_hstate(compound_head(page)), preferred_nid, nodemask); - if (thp_migration_supported() && PageTransHuge(page)) { - order = HPAGE_PMD_ORDER; + if (PageTransHuge(page)) { gfp_mask |= GFP_TRANSHUGE; + order = HPAGE_PMD_ORDER; } if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 92fe1b118707..42b7154291e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); @@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk, BUG(); return -EFAULT; } +static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } +static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, - unsigned int gup_flags); +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unmap_mapping_range(mapping, holebegin, holelen, 0); +} + +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, @@ -2090,6 +2096,7 @@ extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +extern void setup_zone_pageset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cfd0ac4e5e0e..fd1af6b9591d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,28 +31,56 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. + * who is mapping it. If you allocate the page using alloc_pages(), you + * can use some of the space in struct page for your own purposes. * - * The objects in struct page are organized in double word blocks in - * order to allows us to use atomic double word operations on portions - * of struct page. That is currently only used by slub but the arrangement - * allows the use of atomic double word operations on the flags/mapping - * and lru list pointers also. + * Pages that were once in the page cache may be found under the RCU lock + * even after they have been recycled to a different purpose. The page + * cache reads and writes some of the fields in struct page to pin the + * page before checking that it's still in the page cache. It is vital + * that all users of struct page: + * 1. Use the first word as PageFlags. + * 2. Clear or preserve bit 0 of page->compound_head. It is used as + * PageTail for compound pages, and the page cache must not see false + * positives. Some users put a pointer here (guaranteed to be at least + * 4-byte aligned), other users avoid using the field altogether. + * 3. page->_refcount must either not be used, or must be used in such a + * way that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * + * If you allocate pages of order > 0, you can use the fields in the struct + * page associated with each page, but bear in mind that the pages may have + * been inserted individually into the page cache, so you must use the above + * four fields in a compatible way for each struct page. + * + * SLUB uses cmpxchg_double() to atomically update its freelist and + * counters. That requires that freelist & counters be adjacent and + * double-word aligned. We align all struct pages to double-word + * boundaries, and ensure that 'freelist' is aligned within the + * struct. */ +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE +#define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) +#define _slub_counter_t unsigned long +#else +#define _slub_counter_t unsigned int +#endif +#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#define _struct_page_alignment +#define _slub_counter_t unsigned int +#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ + struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ union { - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object - * or KSM private structure. See - * PAGE_MAPPING_ANON and - * PAGE_MAPPING_KSM. - */ + /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ @@ -66,40 +94,27 @@ struct page { }; union { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; -#else - /* - * Keep _refcount separate from slub cmpxchg_double data. - * As the rest of the double word is protected by slab_lock - * but _refcount is not. - */ - unsigned counters; -#endif - struct { + _slub_counter_t counters; + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ + + struct { /* Page cache */ + /* + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. + */ + atomic_t _mapcount; - union { - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. - */ - atomic_t _mapcount; - - unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; /* * Usage count, *USE WRAPPER FUNCTION* when manual * accounting. See page_ref.h @@ -109,8 +124,6 @@ struct page { }; /* - * Third double word block - * * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to * avoid collision and false-positive PageTail(). @@ -145,19 +158,9 @@ struct page { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ -#ifdef CONFIG_64BIT - /* - * On 64 bit system we have enough space in struct page - * to encode compound_dtor and compound_order with - * unsigned int. It can help compiler generate better or - * smaller code on some archtectures. - */ - unsigned int compound_dtor; - unsigned int compound_order; -#else - unsigned short int compound_dtor; - unsigned short int compound_order; -#endif + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS @@ -171,15 +174,14 @@ struct page { #endif }; - /* Remainder is not double word aligned */ union { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache; - * indicates order in the buddy - * system if PG_buddy is set. - */ + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; @@ -212,15 +214,7 @@ struct page { #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif -} -/* - * The struct page can be forced to be double word aligned so that atomic ops - * on double words work. The SLUB allocator can make use of such a feature. - */ -#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE - __aligned(2 * sizeof(unsigned long)) -#endif -; +} _struct_page_alignment; #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b25dc9db19fc..2d07a1ed5a31 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H +#include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> @@ -10,6 +11,9 @@ struct mmu_notifier; struct mmu_notifier_ops; +/* mmu_notifier_ops flags */ +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -27,6 +31,15 @@ struct mmu_notifier_mm { struct mmu_notifier_ops { /* + * Flags to specify behavior of callbacks for this MMU notifier. + * Used to determine which context an operation may be called. + * + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not + * block + */ + int flags; + + /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier @@ -137,6 +150,10 @@ struct mmu_notifier_ops { * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. + * + * If both of these callbacks cannot block, and invalidate_range + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, @@ -159,12 +176,13 @@ struct mmu_notifier_ops { * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.txt * - * The invalidate_range() function is called under the ptl - * spin-lock and not allowed to sleep. - * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. + * + * If this callback cannot block, and invalidate_range_{start,end} + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + return false; +} + static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c38939..e09fe563d5dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -886,7 +886,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; +extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, @@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void); /* * We use the lower bits of the mem_map pointer to store - * a little bit of information. There should be at least - * 3 bits here due to 32-bit alignment. + * a little bit of information. The pointer is calculated + * as mem_map - section_nr_to_pfn(pnum). The result is + * aligned to the minimum alignment of the two values: + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT + * lowest bits. PFN_SECTION_SHIFT is arch-specific + * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the + * worst combination is powerpc with 256k pages, + * which results in PFN_SECTION_SHIFT equal 6. + * To sum it up, at least 6 bits are available. */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) diff --git a/include/linux/oom.h b/include/linux/oom.h index 5bad038ac012..d4d41c01a74d 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -10,6 +10,13 @@ #include <linux/sched/coredump.h> /* MMF_* */ #include <linux/mm.h> /* VM_FAULT* */ + +/* + * Special value returned by victim selection functions to indicate + * that are inflight OOM victims. + */ +#define INFLIGHT_VICTIM ((void *)-1UL) + struct zonelist; struct notifier_block; struct mem_cgroup; @@ -40,7 +47,8 @@ struct oom_control { /* Used by oom implementation, do not set */ unsigned long totalpages; - struct task_struct *chosen; + struct task_struct *chosen_task; + struct mem_cgroup *chosen_memcg; unsigned long chosen_points; }; @@ -111,6 +119,8 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern int oom_evaluate_task(struct task_struct *task, void *arg); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index cdad58bbfd8b..4ae347cbc36d 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -63,7 +63,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages); -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp); +struct page *alloc_migrate_target(struct page *page, unsigned long private); #endif diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5fb6580f7f23..6dc456ac6136 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,14 +9,14 @@ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H -/* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +/* 15 pointers + header align the pagevec structure to a power of two */ +#define PAGEVEC_SIZE 15 struct page; struct address_space; struct pagevec { - unsigned long nr; + unsigned char nr; bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 166144c04ef6..ce5a27304b03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1489,6 +1489,11 @@ static inline struct thread_info *task_thread_info(struct task_struct *task) extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); +/* + * find a task by its virtual pid and get the task struct + */ +extern struct task_struct *find_get_task_by_vpid(pid_t nr); + extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..bd422561a75e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -11,7 +11,7 @@ /* * Routines for handling mm_structs */ -extern struct mm_struct * mm_alloc(void); +extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. @@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } -/* mmdrop drops the mm and the page tables */ -extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} - -static inline void mmdrop_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmdrop(mm); -} - -static inline void mmdrop_async(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) { - INIT_WORK(&mm->async_put_work, mmdrop_async_fn); - schedule_work(&mm->async_put_work); - } -} +extern void mmdrop(struct mm_struct *mm); /** * mmget() - Pin the address space associated with a &struct mm_struct. diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23b4f9cb82db..18f2b9a6c137 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -104,6 +104,9 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + /* The signal sent when the parent dies: */ + int pdeath_signal_proc; + /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 09c6e28746f9..53f238934d7f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -121,6 +121,9 @@ void seq_puts(struct seq_file *m, const char *s); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, int width); + void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 06b295bec00d..73b5e655a76e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,13 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern int shmem_add_seals(struct file *file, unsigned int seals); -extern int shmem_get_seals(struct file *file); -extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else -static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) { return -EINVAL; } diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 388ff2936a87..a3894918a436 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -75,6 +75,6 @@ struct shrinker { #define SHRINKER_NUMA_AWARE (1 << 0) #define SHRINKER_MEMCG_AWARE (1 << 1) -extern int register_shrinker(struct shrinker *); +extern __must_check int register_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *); #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index c2b8128799c1..a1a3f4ed94ce 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -332,20 +332,16 @@ extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); -extern void lru_add_drain_all_cpuslocked(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); -extern void add_page_to_unevictable_list(struct page *page); - extern void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); -extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 33b0bdbb613c..d9c4a6cce3c2 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -17,6 +17,7 @@ #define _LINUX_UUID_H_ #include <uapi/linux/uuid.h> +#include <linux/string.h> #define UUID_SIZE 16 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c9817b39..a4c2317d8b9f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } -static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, - enum node_stat_item item) -{ - long x = atomic_long_read(&pgdat->vm_stat[item]); - -#ifdef CONFIG_SMP - int cpu; - for_each_online_cpu(cpu) - x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; - - if (x < 0) - x = 0; -#endif - return x; -} - - #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 004ba807df96..7238865e75b0 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); +bool zpool_evictable(struct zpool *pool); + #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d70b53e65f43..e0b8b9173e1c 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -192,12 +192,12 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, - long nr_objects_to_shrink, unsigned long pgs_scanned, - unsigned long lru_pgs, unsigned long cache_items, - unsigned long long delta, unsigned long total_scan), + long nr_objects_to_shrink, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan, + int priority), - TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, - cache_items, delta, total_scan), + TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, + priority), TP_STRUCT__entry( __field(struct shrinker *, shr) @@ -205,11 +205,10 @@ TRACE_EVENT(mm_shrink_slab_start, __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) - __field(unsigned long, pgs_scanned) - __field(unsigned long, lru_pgs) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) + __field(int, priority) ), TP_fast_assign( @@ -218,24 +217,22 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; - __entry->pgs_scanned = pgs_scanned; - __entry->lru_pgs = lru_pgs; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; + __entry->priority = priority; ), - TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), - __entry->pgs_scanned, - __entry->lru_pgs, __entry->cache_items, __entry->delta, - __entry->total_scan) + __entry->total_scan, + __entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f8b134f5608f..e7ee32861d51 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -27,6 +27,9 @@ # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif +/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ + /* * Flags for mlock */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index af5f8c2df87a..3165863aa187 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -207,4 +207,8 @@ struct prctl_mm_map { # define PR_SVE_VL_LEN_MASK 0xffff # define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Process-based variant of PDEATHSIG */ +#define PR_SET_PDEATHSIG_PROC 48 +#define PR_GET_PDEATHSIG_PROC 49 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 0f272818a4d2..be69135eb361 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -27,6 +27,10 @@ #include <linux/types.h> #include <linux/compiler.h> +#ifndef __KERNEL__ +#include <stddef.h> /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl? In other words, what is the largest acceptable value for the nlen diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 5c04130bb524..e5a7eecef7c3 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -19,7 +19,6 @@ #define _UAPI_LINUX_UUID_H_ #include <linux/types.h> -#include <linux/string.h> typedef struct { __u8 b[16]; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 690ae6665500..360e564ae7d1 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -596,7 +596,7 @@ static void wq_add(struct mqueue_inode_info *info, int sr, ewp->task = current; list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { - if (walk->task->static_prio <= current->static_prio) { + if (walk->task->prio <= current->prio) { list_add_tail(&ewp->list, &walk->list); return; } diff --git a/ipc/msg.c b/ipc/msg.c index 1bbc029d2b17..0dcc6699dc53 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -476,9 +476,9 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, static int msgctl_stat(struct ipc_namespace *ns, int msqid, int cmd, struct msqid64_ds *p) { - int err; struct msg_queue *msq; - int success_return; + int id = 0; + int err; memset(p, 0, sizeof(*p)); @@ -489,14 +489,13 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, err = PTR_ERR(msq); goto out_unlock; } - success_return = msq->q_perm.id; + id = msq->q_perm.id; } else { msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } - success_return = 0; } err = -EACCES; @@ -507,6 +506,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, if (err) goto out_unlock; + ipc_lock_object(&msq->q_perm); + + if (!ipc_valid_object(&msq->q_perm)) { + ipc_unlock_object(&msq->q_perm); + err = -EIDRM; + goto out_unlock; + } + kernel_to_ipc64_perm(&msq->q_perm, &p->msg_perm); p->msg_stime = msq->q_stime; p->msg_rtime = msq->q_rtime; @@ -516,9 +523,10 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, p->msg_qbytes = msq->q_qbytes; p->msg_lspid = msq->q_lspid; p->msg_lrpid = msq->q_lrpid; - rcu_read_unlock(); - return success_return; + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); + return id; out_unlock: rcu_read_unlock(); diff --git a/ipc/sem.c b/ipc/sem.c index 87bd38f38dc3..a4af04979fd2 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1213,10 +1213,20 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, if (err) goto out_unlock; + ipc_lock_object(&sma->sem_perm); + + if (!ipc_valid_object(&sma->sem_perm)) { + ipc_unlock_object(&sma->sem_perm); + err = -EIDRM; + goto out_unlock; + } + kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm); semid64->sem_otime = get_semotime(sma); semid64->sem_ctime = sma->sem_ctime; semid64->sem_nsems = sma->sem_nsems; + + ipc_unlock_object(&sma->sem_perm); rcu_read_unlock(); return id; diff --git a/ipc/shm.c b/ipc/shm.c index 7acda23430aa..4643865e9171 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -909,9 +909,11 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *tbuf) { struct shmid_kernel *shp; - int result; + int id = 0; int err; + memset(tbuf, 0, sizeof(*tbuf)); + rcu_read_lock(); if (cmd == SHM_STAT) { shp = shm_obtain_object(ns, shmid); @@ -919,14 +921,13 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, err = PTR_ERR(shp); goto out_unlock; } - result = shp->shm_perm.id; + id = shp->shm_perm.id; } else { shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } - result = 0; } err = -EACCES; @@ -937,7 +938,14 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, if (err) goto out_unlock; - memset(tbuf, 0, sizeof(*tbuf)); + ipc_lock_object(&shp->shm_perm); + + if (!ipc_valid_object(&shp->shm_perm)) { + ipc_unlock_object(&shp->shm_perm); + err = -EIDRM; + goto out_unlock; + } + kernel_to_ipc64_perm(&shp->shm_perm, &tbuf->shm_perm); tbuf->shm_segsz = shp->shm_segsz; tbuf->shm_atime = shp->shm_atim; @@ -946,8 +954,10 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, tbuf->shm_cpid = shp->shm_cprid; tbuf->shm_lpid = shp->shm_lprid; tbuf->shm_nattch = shp->shm_nattch; + + ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); - return result; + return id; out_unlock: rcu_read_unlock(); diff --git a/ipc/util.c b/ipc/util.c index ff045fec8d83..4ed5a17dd06f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -23,9 +23,12 @@ * tree. * - perform initial checks (capabilities, auditing and permission, * etc). - * - perform read-only operations, such as STAT, INFO commands. + * - perform read-only operations, such as INFO command, that + * do not demand atomicity * acquire the ipc lock (kern_ipc_perm.lock) through * ipc_lock_object() + * - perform read-only operations that demand atomicity, + * such as STAT command. * - perform data updates, such as SET, RMID commands and * mechanism-specific operations (semop/semtimedop, * msgsnd/msgrcv, shmat/shmdt). diff --git a/kernel/async.c b/kernel/async.c index 2cbd3dd5940d..a893d6170944 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -84,20 +84,24 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct list_head *pending; + struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) - pending = &domain->pending; - else - pending = &async_global_pending; + if (domain) { + if (!list_empty(&domain->pending)) + first = list_first_entry(&domain->pending, + struct async_entry, domain_list); + } else { + if (!list_empty(&async_global_pending)) + first = list_first_entry(&async_global_pending, + struct async_entry, global_list); + } - if (!list_empty(pending)) - ret = list_first_entry(pending, struct async_entry, - domain_list)->cookie; + if (first) + ret = first->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b4a0d898a796..55fff758a803 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1732,6 +1732,9 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) if (!strcmp(token, "nsdelegate")) { *root_flags |= CGRP_ROOT_NS_DELEGATE; continue; + } else if (!strcmp(token, "groupoom")) { + *root_flags |= CGRP_GROUP_OOM; + continue; } pr_err("cgroup2: unknown option \"%s\"\n", token); @@ -1748,6 +1751,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_GROUP_OOM) + cgrp_dfl_root.flags |= CGRP_GROUP_OOM; + else + cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM; } } @@ -1755,6 +1763,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_GROUP_OOM) + seq_puts(seq, ",groupoom"); return 0; } @@ -5912,7 +5922,8 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "groupoom\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/compat.c b/kernel/compat.c index d1cee656a7ed..3247fe761f60 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -355,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) ret = -EFAULT; diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394..9bfdffc100da 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -10,3 +10,7 @@ CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y +CONFIG_CC_STACKPROTECTOR_NONE=y +# CONFIG_CC_STACKPROTECTOR_REGULAR is not set +# CONFIG_CC_STACKPROTECTOR_STRONG is not set +# CONFIG_CC_STACKPROTECTOR_AUTO is not set diff --git a/kernel/cred.c b/kernel/cred.c index ecf03657e71c..0192a94670e1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -448,6 +448,7 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + task->signal->pdeath_signal_proc = 0; smp_wmb(); } diff --git a/kernel/exit.c b/kernel/exit.c index 995453d9fb55..42ca71a44c9a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -635,6 +635,10 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (unlikely(p->exit_state == EXIT_DEAD)) return; + if (p->signal->pdeath_signal_proc) + group_send_sig_info(p->signal->pdeath_signal_proc, + SEND_SIG_NOINFO, p); + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; diff --git a/kernel/fork.c b/kernel/fork.c index bed0eaf7233f..2113e252cb9d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> +#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -391,6 +392,241 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. + */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +static void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} + +void mmdrop(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -613,181 +849,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -877,27 +940,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. */ @@ -913,24 +955,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); @@ -1451,6 +1475,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) mutex_init(&sig->cred_guard_mutex); + sig->pdeath_signal_proc = current->signal->pdeath_signal_proc; + return 0; } @@ -1563,6 +1589,10 @@ static __latent_entropy struct task_struct *copy_process( int retval; struct task_struct *p; + /* + * Don't allow sharing the root directory with processes in a different + * namespace + */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -2038,6 +2068,8 @@ long _do_fork(unsigned long clone_flags, int __user *child_tidptr, unsigned long tls) { + struct completion vfork; + struct pid *pid; struct task_struct *p; int trace = 0; long nr; @@ -2063,43 +2095,40 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); + + if (IS_ERR(p)) + return PTR_ERR(p); + /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ - if (!IS_ERR(p)) { - struct completion vfork; - struct pid *pid; - - trace_sched_process_fork(current, p); + trace_sched_process_fork(current, p); - pid = get_task_pid(p, PIDTYPE_PID); - nr = pid_vnr(pid); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); - if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); - get_task_struct(p); - } - - wake_up_new_task(p); + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + get_task_struct(p); + } - /* forking complete and child started to run, tell ptracer */ - if (unlikely(trace)) - ptrace_event_pid(trace, pid); + wake_up_new_task(p); - if (clone_flags & CLONE_VFORK) { - if (!wait_for_vfork_done(p, &vfork)) - ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); - } + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event_pid(trace, pid); - put_pid(pid); - } else { - nr = PTR_ERR(p); + if (clone_flags & CLONE_VFORK) { + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); return nr; } diff --git a/kernel/futex.c b/kernel/futex.c index 7f719d110908..1f450e092c74 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -862,24 +862,6 @@ static void put_pi_state(struct futex_pi_state *pi_state) } } -/* - * Look up the task based on what TID userspace gave us. - * We dont trust it. - */ -static struct task_struct *futex_find_get_task(pid_t pid) -{ - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - get_task_struct(p); - - rcu_read_unlock(); - - return p; -} - #ifdef CONFIG_FUTEX_PI /* @@ -1183,7 +1165,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, */ if (!pid) return -ESRCH; - p = futex_find_get_task(pid); + p = find_get_task_by_vpid(pid); if (!p) return -ESRCH; diff --git a/kernel/kcov.c b/kernel/kcov.c index 7594c033d98a..2c16f1ab5e10 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -358,7 +358,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, */ if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; - if (kcov->t != NULL) + t = current; + if (kcov->t != NULL || t->kcov != NULL) return -EBUSY; if (arg == KCOV_TRACE_PC) kcov->mode = KCOV_MODE_TRACE_PC; @@ -370,7 +371,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, #endif else return -EINVAL; - t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; diff --git a/kernel/pid.c b/kernel/pid.c index 5d30c87e3c42..ed6c343fe50d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -343,6 +343,19 @@ struct task_struct *find_task_by_vpid(pid_t vnr) return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } +struct task_struct *find_get_task_by_vpid(pid_t nr) +{ + struct task_struct *task; + + rcu_read_lock(); + task = find_task_by_vpid(nr); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 5e1d713c8e61..21fec73d45d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1103,21 +1103,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -static struct task_struct *ptrace_get_task_struct(pid_t pid) -{ - struct task_struct *child; - - rcu_read_lock(); - child = find_task_by_vpid(pid); - if (child) - get_task_struct(child); - rcu_read_unlock(); - - if (!child) - return ERR_PTR(-ESRCH); - return child; -} - #ifndef arch_ptrace_attach #define arch_ptrace_attach(child) do { } while (0) #endif @@ -1135,9 +1120,9 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); + child = find_get_task_by_vpid(pid); + if (!child) { + ret = -ESRCH; goto out; } @@ -1281,9 +1266,9 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, goto out; } - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); + child = find_get_task_by_vpid(pid); + if (!child) { + ret = -ESRCH; goto out; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3da7a2444a91..d99fb5bf61ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4853,7 +4853,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; diff --git a/kernel/sys.c b/kernel/sys.c index f2289de20e19..31a2866b7abd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2213,6 +2213,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; + case PR_SET_PDEATHSIG_PROC: + if (!valid_signal(arg2)) { + error = -EINVAL; + break; + } + me->signal->pdeath_signal_proc = arg2; + break; + case PR_GET_PDEATHSIG_PROC: + error = put_user(me->signal->pdeath_signal_proc, + (int __user *)arg2); + break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4559e914452b..4e62a4a8fa91 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -194,11 +194,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) { struct task_struct *tsk; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); + tsk = find_get_task_by_vpid(pid); if (!tsk) return -ESRCH; fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 64d7c19d3167..1a1423923bcf 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1641,7 +1641,10 @@ config DMA_API_DEBUG If unsure, say N. -menu "Runtime Testing" +menuconfig RUNTIME_TESTING_MENU + bool "Runtime Testing" + +if RUNTIME_TESTING_MENU config LKDTM tristate "Linux Kernel Dump Test Tool Module" @@ -1841,7 +1844,7 @@ config TEST_BPF If unsure, say N. -config TEST_FIND_BIT +config FIND_BIT_BENCHMARK tristate "Test find_bit functions" default n help @@ -1929,7 +1932,7 @@ config TEST_DEBUG_VIRTUAL If unsure, say N. -endmenu # runtime tests +endif # RUNTIME_TESTING_MENU config MEMTEST bool "Memtest" diff --git a/lib/Makefile b/lib/Makefile index 7adb066692b3..a90d4fcd748f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -46,8 +46,8 @@ obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o +obj-$(CONFIG_FIND_BIT_BENCHMARK) += find_bit_benchmark.o obj-$(CONFIG_TEST_BPF) += test_bpf.o -obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o diff --git a/lib/bitmap.c b/lib/bitmap.c index d8f0c094b18e..9e498c77ed0e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1106,111 +1106,80 @@ int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) EXPORT_SYMBOL(bitmap_allocate_region); /** - * bitmap_from_u32array - copy the contents of a u32 array of bits to bitmap - * @bitmap: array of unsigned longs, the destination bitmap, non NULL - * @nbits: number of bits in @bitmap - * @buf: array of u32 (in host byte order), the source bitmap, non NULL - * @nwords: number of u32 words in @buf - * - * copy min(nbits, 32*nwords) bits from @buf to @bitmap, remaining - * bits between nword and nbits in @bitmap (if any) are cleared. In - * last word of @bitmap, the bits beyond nbits (if any) are kept - * unchanged. + * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order. + * @dst: destination buffer + * @src: bitmap to copy + * @nbits: number of bits in the bitmap * - * Return the number of bits effectively copied. + * Require nbits % BITS_PER_LONG == 0. */ -unsigned int -bitmap_from_u32array(unsigned long *bitmap, unsigned int nbits, - const u32 *buf, unsigned int nwords) +#ifdef __BIG_ENDIAN +void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits) { - unsigned int dst_idx, src_idx; - - for (src_idx = dst_idx = 0; dst_idx < BITS_TO_LONGS(nbits); ++dst_idx) { - unsigned long part = 0; - - if (src_idx < nwords) - part = buf[src_idx++]; - -#if BITS_PER_LONG == 64 - if (src_idx < nwords) - part |= ((unsigned long) buf[src_idx++]) << 32; -#endif - - if (dst_idx < nbits/BITS_PER_LONG) - bitmap[dst_idx] = part; - else { - unsigned long mask = BITMAP_LAST_WORD_MASK(nbits); + unsigned int i; - bitmap[dst_idx] = (bitmap[dst_idx] & ~mask) - | (part & mask); - } + for (i = 0; i < nbits/BITS_PER_LONG; i++) { + if (BITS_PER_LONG == 64) + dst[i] = cpu_to_le64(src[i]); + else + dst[i] = cpu_to_le32(src[i]); } - - return min_t(unsigned int, nbits, 32*nwords); } -EXPORT_SYMBOL(bitmap_from_u32array); +EXPORT_SYMBOL(bitmap_copy_le); +#endif +#if BITS_PER_LONG == 64 /** - * bitmap_to_u32array - copy the contents of bitmap to a u32 array of bits - * @buf: array of u32 (in host byte order), the dest bitmap, non NULL - * @nwords: number of u32 words in @buf - * @bitmap: array of unsigned longs, the source bitmap, non NULL + * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap + * @bitmap: array of unsigned longs, the destination bitmap + * @buf: array of u32 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap - * - * copy min(nbits, 32*nwords) bits from @bitmap to @buf. Remaining - * bits after nbits in @buf (if any) are cleared. - * - * Return the number of bits effectively copied. */ -unsigned int -bitmap_to_u32array(u32 *buf, unsigned int nwords, - const unsigned long *bitmap, unsigned int nbits) +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, + unsigned int nbits) { - unsigned int dst_idx = 0, src_idx = 0; - - while (dst_idx < nwords) { - unsigned long part = 0; - - if (src_idx < BITS_TO_LONGS(nbits)) { - part = bitmap[src_idx]; - if (src_idx >= nbits/BITS_PER_LONG) - part &= BITMAP_LAST_WORD_MASK(nbits); - src_idx++; - } + unsigned int i, halfwords; - buf[dst_idx++] = part & 0xffffffffUL; + if (!nbits) + return; -#if BITS_PER_LONG == 64 - if (dst_idx < nwords) { - part >>= 32; - buf[dst_idx++] = part & 0xffffffffUL; - } -#endif + halfwords = DIV_ROUND_UP(nbits, 32); + for (i = 0; i < halfwords; i++) { + bitmap[i/2] = (unsigned long) buf[i]; + if (++i < halfwords) + bitmap[i/2] |= ((unsigned long) buf[i]) << 32; } - return min_t(unsigned int, nbits, 32*nwords); + /* Clear tail bits in last word beyond nbits. */ + if (nbits % BITS_PER_LONG) + bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); } -EXPORT_SYMBOL(bitmap_to_u32array); +EXPORT_SYMBOL(bitmap_from_arr32); /** - * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order. - * @dst: destination buffer - * @src: bitmap to copy - * @nbits: number of bits in the bitmap - * - * Require nbits % BITS_PER_LONG == 0. + * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits + * @buf: array of u32 (in host byte order), the dest bitmap + * @bitmap: array of unsigned longs, the source bitmap + * @nbits: number of bits in @bitmap */ -#ifdef __BIG_ENDIAN -void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits) +void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits) { - unsigned int i; + unsigned int i, halfwords; - for (i = 0; i < nbits/BITS_PER_LONG; i++) { - if (BITS_PER_LONG == 64) - dst[i] = cpu_to_le64(src[i]); - else - dst[i] = cpu_to_le32(src[i]); + if (!nbits) + return; + + halfwords = DIV_ROUND_UP(nbits, 32); + for (i = 0; i < halfwords; i++) { + buf[i] = (u32) (bitmap[i/2] & UINT_MAX); + if (++i < halfwords) + buf[i] = (u32) (bitmap[i/2] >> 32); } + + /* Clear tail bits in last element of array beyond nbits. */ + if (nbits % BITS_PER_LONG) + buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31)); } -EXPORT_SYMBOL(bitmap_copy_le); +EXPORT_SYMBOL(bitmap_to_arr32); + #endif diff --git a/lib/cpumask.c b/lib/cpumask.c index 35fe142ebb5e..beca6244671a 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -33,10 +33,11 @@ EXPORT_SYMBOL(cpumask_next); int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) - if (cpumask_test_cpu(n, src2p)) - break; - return n; + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits, n + 1); } EXPORT_SYMBOL(cpumask_next_and); diff --git a/lib/find_bit.c b/lib/find_bit.c index 6ed74f78380c..ee3df93ba69a 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -21,22 +21,29 @@ #include <linux/export.h> #include <linux/kernel.h> -#if !defined(find_next_bit) || !defined(find_next_zero_bit) +#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ + !defined(find_next_and_bit) /* - * This is a common helper function for find_next_bit and - * find_next_zero_bit. The difference is the "invert" argument, which - * is XORed with each fetched word before searching it for one bits. + * This is a common helper function for find_next_bit, find_next_zero_bit, and + * find_next_and_bit. The differences are: + * - The "invert" argument, which is XORed with each fetched word before + * searching it for one bits. + * - The optional "addr2", which is anded with "addr1" if present. */ -static unsigned long _find_next_bit(const unsigned long *addr, - unsigned long nbits, unsigned long start, unsigned long invert) +static inline unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert) { unsigned long tmp; if (unlikely(start >= nbits)) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; /* Handle 1st word. */ tmp &= BITMAP_FIRST_WORD_MASK(start); @@ -47,7 +54,10 @@ static unsigned long _find_next_bit(const unsigned long *addr, if (start >= nbits) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; } return min(start + __ffs(tmp), nbits); @@ -61,7 +71,7 @@ static unsigned long _find_next_bit(const unsigned long *addr, unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - return _find_next_bit(addr, size, offset, 0UL); + return _find_next_bit(addr, NULL, size, offset, 0UL); } EXPORT_SYMBOL(find_next_bit); #endif @@ -70,11 +80,21 @@ EXPORT_SYMBOL(find_next_bit); unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - return _find_next_bit(addr, size, offset, ~0UL); + return _find_next_bit(addr, NULL, size, offset, ~0UL); } EXPORT_SYMBOL(find_next_zero_bit); #endif +#if !defined(find_next_and_bit) +unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr1, addr2, size, offset, 0UL); +} +EXPORT_SYMBOL(find_next_and_bit); +#endif + #ifndef find_first_bit /* * Find the first set bit in a memory region. @@ -146,15 +166,19 @@ static inline unsigned long ext2_swab(const unsigned long y) } #if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) -static unsigned long _find_next_bit_le(const unsigned long *addr, - unsigned long nbits, unsigned long start, unsigned long invert) +static inline unsigned long _find_next_bit_le(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert) { unsigned long tmp; if (unlikely(start >= nbits)) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; /* Handle 1st word. */ tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); @@ -165,7 +189,10 @@ static unsigned long _find_next_bit_le(const unsigned long *addr, if (start >= nbits) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; } return min(start + __ffs(ext2_swab(tmp)), nbits); @@ -176,7 +203,7 @@ static unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { - return _find_next_bit_le(addr, size, offset, ~0UL); + return _find_next_bit_le(addr, NULL, size, offset, ~0UL); } EXPORT_SYMBOL(find_next_zero_bit_le); #endif @@ -185,7 +212,7 @@ EXPORT_SYMBOL(find_next_zero_bit_le); unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { - return _find_next_bit_le(addr, size, offset, 0UL); + return _find_next_bit_le(addr, NULL, size, offset, 0UL); } EXPORT_SYMBOL(find_next_bit_le); #endif diff --git a/lib/test_find_bit.c b/lib/find_bit_benchmark.c index f4394a36f9aa..5985a25e6cbc 100644 --- a/lib/test_find_bit.c +++ b/lib/find_bit_benchmark.c @@ -35,6 +35,7 @@ #define SPARSE 500 static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; +static DECLARE_BITMAP(bitmap2, BITMAP_LEN) __initdata; /* * This is Schlemiel the Painter's algorithm. It should be called after @@ -43,16 +44,15 @@ static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; static int __init test_find_first_bit(void *bitmap, unsigned long len) { unsigned long i, cnt; - cycles_t cycles; + ktime_t time; - cycles = get_cycles(); + time = ktime_get(); for (cnt = i = 0; i < len; cnt++) { i = find_first_bit(bitmap, len); __clear_bit(i, bitmap); } - cycles = get_cycles() - cycles; - pr_err("find_first_bit:\t\t%llu cycles,\t%ld iterations\n", - (u64)cycles, cnt); + time = ktime_get() - time; + pr_err("find_first_bit: %18llu ns, %6ld iterations\n", time, cnt); return 0; } @@ -60,14 +60,13 @@ static int __init test_find_first_bit(void *bitmap, unsigned long len) static int __init test_find_next_bit(const void *bitmap, unsigned long len) { unsigned long i, cnt; - cycles_t cycles; + ktime_t time; - cycles = get_cycles(); + time = ktime_get(); for (cnt = i = 0; i < BITMAP_LEN; cnt++) i = find_next_bit(bitmap, BITMAP_LEN, i) + 1; - cycles = get_cycles() - cycles; - pr_err("find_next_bit:\t\t%llu cycles,\t%ld iterations\n", - (u64)cycles, cnt); + time = ktime_get() - time; + pr_err("find_next_bit: %18llu ns, %6ld iterations\n", time, cnt); return 0; } @@ -75,14 +74,13 @@ static int __init test_find_next_bit(const void *bitmap, unsigned long len) static int __init test_find_next_zero_bit(const void *bitmap, unsigned long len) { unsigned long i, cnt; - cycles_t cycles; + ktime_t time; - cycles = get_cycles(); + time = ktime_get(); for (cnt = i = 0; i < BITMAP_LEN; cnt++) i = find_next_zero_bit(bitmap, len, i) + 1; - cycles = get_cycles() - cycles; - pr_err("find_next_zero_bit:\t%llu cycles,\t%ld iterations\n", - (u64)cycles, cnt); + time = ktime_get() - time; + pr_err("find_next_zero_bit: %18llu ns, %6ld iterations\n", time, cnt); return 0; } @@ -90,9 +88,9 @@ static int __init test_find_next_zero_bit(const void *bitmap, unsigned long len) static int __init test_find_last_bit(const void *bitmap, unsigned long len) { unsigned long l, cnt = 0; - cycles_t cycles; + ktime_t time; - cycles = get_cycles(); + time = ktime_get(); do { cnt++; l = find_last_bit(bitmap, len); @@ -100,9 +98,24 @@ static int __init test_find_last_bit(const void *bitmap, unsigned long len) break; len = l; } while (len); + time = ktime_get() - time; + pr_err("find_last_bit: %18llu ns, %6ld iterations\n", time, cnt); + + return 0; +} + +static int __init test_find_next_and_bit(const void *bitmap, + const void *bitmap2, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_and_bit(bitmap, bitmap2, BITMAP_LEN, i+1); cycles = get_cycles() - cycles; - pr_err("find_last_bit:\t\t%llu cycles,\t%ld iterations\n", - (u64)cycles, cnt); + pr_err("find_next_and_bit:\t\t%llu cycles, %ld iterations\n", + (u64)cycles, cnt); return 0; } @@ -114,31 +127,36 @@ static int __init find_bit_test(void) pr_err("\nStart testing find_bit() with random-filled bitmap\n"); get_random_bytes(bitmap, sizeof(bitmap)); + get_random_bytes(bitmap2, sizeof(bitmap2)); test_find_next_bit(bitmap, BITMAP_LEN); test_find_next_zero_bit(bitmap, BITMAP_LEN); test_find_last_bit(bitmap, BITMAP_LEN); test_find_first_bit(bitmap, BITMAP_LEN); + test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); pr_err("\nStart testing find_bit() with sparse bitmap\n"); bitmap_zero(bitmap, BITMAP_LEN); + bitmap_zero(bitmap2, BITMAP_LEN); - while (nbits--) + while (nbits--) { __set_bit(prandom_u32() % BITMAP_LEN, bitmap); + __set_bit(prandom_u32() % BITMAP_LEN, bitmap2); + } test_find_next_bit(bitmap, BITMAP_LEN); test_find_next_zero_bit(bitmap, BITMAP_LEN); test_find_last_bit(bitmap, BITMAP_LEN); test_find_first_bit(bitmap, BITMAP_LEN); + test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); - return 0; + /* + * Everything is OK. Return error just to let user run benchmark + * again without annoying rmmod. + */ + return -EINVAL; } module_init(find_bit_test); -static void __exit test_find_bit_cleanup(void) -{ -} -module_exit(test_find_bit_cleanup); - MODULE_LICENSE("GPL"); diff --git a/lib/idr.c b/lib/idr.c index 2593ce513a18..37c552395b3a 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -300,7 +300,6 @@ int ida_get_new_above(struct ida *ida, int start, int *id) bitmap = this_cpu_xchg(ida_bitmap, NULL); if (!bitmap) return -EAGAIN; - memset(bitmap, 0, sizeof(*bitmap)); bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT; rcu_assign_pointer(*slot, bitmap); } @@ -333,7 +332,6 @@ int ida_get_new_above(struct ida *ida, int start, int *id) bitmap = this_cpu_xchg(ida_bitmap, NULL); if (!bitmap) return -EAGAIN; - memset(bitmap, 0, sizeof(*bitmap)); __set_bit(bit, bitmap->bitmap); radix_tree_iter_replace(root, &iter, slot, bitmap); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index c8d55565fafa..553248b64d60 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2124,7 +2124,7 @@ int ida_pre_get(struct ida *ida, gfp_t gfp) preempt_enable(); if (!this_cpu_read(ida_bitmap)) { - struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp); + struct ida_bitmap *bitmap = kzalloc(sizeof(*bitmap), gfp); if (!bitmap) return 0; if (this_cpu_cmpxchg(ida_bitmap, NULL, bitmap)) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index f87d138e9672..e513459a5601 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -163,6 +163,21 @@ static inline u32 hash_stack(unsigned long *entries, unsigned int size) STACK_HASH_SEED); } +/* Use our own, non-instrumented version of memcmp(). + * + * We actually don't care about the order, just the equality. + */ +static inline +int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, + unsigned int n) +{ + for ( ; n-- ; u1++, u2++) { + if (*u1 != *u2) + return 1; + } + return 0; +} + /* Find a stack that is equal to the one stored in entries in the hash */ static inline struct stack_record *find_stack(struct stack_record *bucket, unsigned long *entries, int size, @@ -173,10 +188,8 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, for (found = bucket; found; found = found->next) { if (found->hash == hash && found->size == size && - !memcmp(entries, found->entries, - size * sizeof(unsigned long))) { + !stackdepot_memcmp(entries, found->entries, size)) return found; - } } return NULL; } diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index aa1f2669bdd5..b3f235baa05d 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -23,7 +23,7 @@ __check_eq_uint(const char *srcfile, unsigned int line, const unsigned int exp_uint, unsigned int x) { if (exp_uint != x) { - pr_warn("[%s:%u] expected %u, got %u\n", + pr_err("[%s:%u] expected %u, got %u\n", srcfile, line, exp_uint, x); return false; } @@ -33,19 +33,13 @@ __check_eq_uint(const char *srcfile, unsigned int line, static bool __init __check_eq_bitmap(const char *srcfile, unsigned int line, - const unsigned long *exp_bmap, unsigned int exp_nbits, - const unsigned long *bmap, unsigned int nbits) + const unsigned long *exp_bmap, const unsigned long *bmap, + unsigned int nbits) { - if (exp_nbits != nbits) { - pr_warn("[%s:%u] bitmap length mismatch: expected %u, got %u\n", - srcfile, line, exp_nbits, nbits); - return false; - } - if (!bitmap_equal(exp_bmap, bmap, nbits)) { pr_warn("[%s:%u] bitmaps contents differ: expected \"%*pbl\", got \"%*pbl\"\n", srcfile, line, - exp_nbits, exp_bmap, nbits, bmap); + nbits, exp_bmap, nbits, bmap); return false; } return true; @@ -69,6 +63,10 @@ __check_eq_pbl(const char *srcfile, unsigned int line, static bool __init __check_eq_u32_array(const char *srcfile, unsigned int line, const u32 *exp_arr, unsigned int exp_len, + const u32 *arr, unsigned int len) __used; +static bool __init +__check_eq_u32_array(const char *srcfile, unsigned int line, + const u32 *exp_arr, unsigned int exp_len, const u32 *arr, unsigned int len) { if (exp_len != len) { @@ -107,7 +105,65 @@ __check_eq_u32_array(const char *srcfile, unsigned int line, #define expect_eq_pbl(...) __expect_eq(pbl, ##__VA_ARGS__) #define expect_eq_u32_array(...) __expect_eq(u32_array, ##__VA_ARGS__) -static void __init test_zero_fill_copy(void) +static void __init test_zero_clear(void) +{ + DECLARE_BITMAP(bmap, 1024); + + /* Known way to set all bits */ + memset(bmap, 0xff, 128); + + expect_eq_pbl("0-22", bmap, 23); + expect_eq_pbl("0-1023", bmap, 1024); + + /* single-word bitmaps */ + bitmap_clear(bmap, 0, 9); + expect_eq_pbl("9-1023", bmap, 1024); + + bitmap_zero(bmap, 35); + expect_eq_pbl("64-1023", bmap, 1024); + + /* cross boundaries operations */ + bitmap_clear(bmap, 79, 19); + expect_eq_pbl("64-78,98-1023", bmap, 1024); + + bitmap_zero(bmap, 115); + expect_eq_pbl("128-1023", bmap, 1024); + + /* Zeroing entire area */ + bitmap_zero(bmap, 1024); + expect_eq_pbl("", bmap, 1024); +} + +static void __init test_fill_set(void) +{ + DECLARE_BITMAP(bmap, 1024); + + /* Known way to clear all bits */ + memset(bmap, 0x00, 128); + + expect_eq_pbl("", bmap, 23); + expect_eq_pbl("", bmap, 1024); + + /* single-word bitmaps */ + bitmap_set(bmap, 0, 9); + expect_eq_pbl("0-8", bmap, 1024); + + bitmap_fill(bmap, 35); + expect_eq_pbl("0-63", bmap, 1024); + + /* cross boundaries operations */ + bitmap_set(bmap, 79, 19); + expect_eq_pbl("0-63,79-97", bmap, 1024); + + bitmap_fill(bmap, 115); + expect_eq_pbl("0-127", bmap, 1024); + + /* Zeroing entire area */ + bitmap_fill(bmap, 1024); + expect_eq_pbl("0-1023", bmap, 1024); +} + +static void __init test_copy(void) { DECLARE_BITMAP(bmap1, 1024); DECLARE_BITMAP(bmap2, 1024); @@ -116,36 +172,20 @@ static void __init test_zero_fill_copy(void) bitmap_zero(bmap2, 1024); /* single-word bitmaps */ - expect_eq_pbl("", bmap1, 23); - - bitmap_fill(bmap1, 19); - expect_eq_pbl("0-18", bmap1, 1024); - + bitmap_set(bmap1, 0, 19); bitmap_copy(bmap2, bmap1, 23); expect_eq_pbl("0-18", bmap2, 1024); - bitmap_fill(bmap2, 23); - expect_eq_pbl("0-22", bmap2, 1024); - + bitmap_set(bmap2, 0, 23); bitmap_copy(bmap2, bmap1, 23); expect_eq_pbl("0-18", bmap2, 1024); - bitmap_zero(bmap1, 23); - expect_eq_pbl("", bmap1, 1024); - /* multi-word bitmaps */ - bitmap_zero(bmap1, 1024); - expect_eq_pbl("", bmap1, 1024); - - bitmap_fill(bmap1, 109); - expect_eq_pbl("0-108", bmap1, 1024); - + bitmap_set(bmap1, 0, 109); bitmap_copy(bmap2, bmap1, 1024); expect_eq_pbl("0-108", bmap2, 1024); bitmap_fill(bmap2, 1024); - expect_eq_pbl("0-1023", bmap2, 1024); - bitmap_copy(bmap2, bmap1, 1024); expect_eq_pbl("0-108", bmap2, 1024); @@ -160,9 +200,6 @@ static void __init test_zero_fill_copy(void) bitmap_fill(bmap2, 1024); bitmap_copy(bmap2, bmap1, 97); /* ... but aligned on word length */ expect_eq_pbl("0-108,128-1023", bmap2, 1024); - - bitmap_zero(bmap2, 97); /* ... but 0-padded til word length */ - expect_eq_pbl("128-1023", bmap2, 1024); } #define PARSE_TIME 0x1 @@ -255,171 +292,29 @@ static void __init test_bitmap_parselist(void) } } -static void __init test_bitmap_u32_array_conversions(void) +static void __init test_bitmap_arr32(void) { - DECLARE_BITMAP(bmap1, 1024); - DECLARE_BITMAP(bmap2, 1024); - u32 exp_arr[32], arr[32]; - unsigned nbits; - - for (nbits = 0 ; nbits < 257 ; ++nbits) { - const unsigned int used_u32s = DIV_ROUND_UP(nbits, 32); - unsigned int i, rv; - - bitmap_zero(bmap1, nbits); - bitmap_set(bmap1, nbits, 1024 - nbits); /* garbage */ - - memset(arr, 0xff, sizeof(arr)); - rv = bitmap_to_u32array(arr, used_u32s, bmap1, nbits); - expect_eq_uint(nbits, rv); - - memset(exp_arr, 0xff, sizeof(exp_arr)); - memset(exp_arr, 0, used_u32s*sizeof(*exp_arr)); - expect_eq_u32_array(exp_arr, 32, arr, 32); - - bitmap_fill(bmap2, 1024); - rv = bitmap_from_u32array(bmap2, nbits, arr, used_u32s); - expect_eq_uint(nbits, rv); - expect_eq_bitmap(bmap1, 1024, bmap2, 1024); - - for (i = 0 ; i < nbits ; ++i) { - /* - * test conversion bitmap -> u32[] - */ - - bitmap_zero(bmap1, 1024); - __set_bit(i, bmap1); - bitmap_set(bmap1, nbits, 1024 - nbits); /* garbage */ - - memset(arr, 0xff, sizeof(arr)); - rv = bitmap_to_u32array(arr, used_u32s, bmap1, nbits); - expect_eq_uint(nbits, rv); - - /* 1st used u32 words contain expected bit set, the - * remaining words are left unchanged (0xff) - */ - memset(exp_arr, 0xff, sizeof(exp_arr)); - memset(exp_arr, 0, used_u32s*sizeof(*exp_arr)); - exp_arr[i/32] = (1U<<(i%32)); - expect_eq_u32_array(exp_arr, 32, arr, 32); - - - /* same, with longer array to fill - */ - memset(arr, 0xff, sizeof(arr)); - rv = bitmap_to_u32array(arr, 32, bmap1, nbits); - expect_eq_uint(nbits, rv); - - /* 1st used u32 words contain expected bit set, the - * remaining words are all 0s - */ - memset(exp_arr, 0, sizeof(exp_arr)); - exp_arr[i/32] = (1U<<(i%32)); - expect_eq_u32_array(exp_arr, 32, arr, 32); - - /* - * test conversion u32[] -> bitmap - */ - - /* the 1st nbits of bmap2 are identical to - * bmap1, the remaining bits of bmap2 are left - * unchanged (all 1s) - */ - bitmap_fill(bmap2, 1024); - rv = bitmap_from_u32array(bmap2, nbits, - exp_arr, used_u32s); - expect_eq_uint(nbits, rv); - - expect_eq_bitmap(bmap1, 1024, bmap2, 1024); - - /* same, with more bits to fill - */ - memset(arr, 0xff, sizeof(arr)); /* garbage */ - memset(arr, 0, used_u32s*sizeof(u32)); - arr[i/32] = (1U<<(i%32)); - - bitmap_fill(bmap2, 1024); - rv = bitmap_from_u32array(bmap2, 1024, arr, used_u32s); - expect_eq_uint(used_u32s*32, rv); - - /* the 1st nbits of bmap2 are identical to - * bmap1, the remaining bits of bmap2 are cleared - */ - bitmap_zero(bmap1, 1024); - __set_bit(i, bmap1); - expect_eq_bitmap(bmap1, 1024, bmap2, 1024); - - - /* - * test short conversion bitmap -> u32[] (1 - * word too short) - */ - if (used_u32s > 1) { - bitmap_zero(bmap1, 1024); - __set_bit(i, bmap1); - bitmap_set(bmap1, nbits, - 1024 - nbits); /* garbage */ - memset(arr, 0xff, sizeof(arr)); - - rv = bitmap_to_u32array(arr, used_u32s - 1, - bmap1, nbits); - expect_eq_uint((used_u32s - 1)*32, rv); - - /* 1st used u32 words contain expected - * bit set, the remaining words are - * left unchanged (0xff) - */ - memset(exp_arr, 0xff, sizeof(exp_arr)); - memset(exp_arr, 0, - (used_u32s-1)*sizeof(*exp_arr)); - if ((i/32) < (used_u32s - 1)) - exp_arr[i/32] = (1U<<(i%32)); - expect_eq_u32_array(exp_arr, 32, arr, 32); - } - - /* - * test short conversion u32[] -> bitmap (3 - * bits too short) - */ - if (nbits > 3) { - memset(arr, 0xff, sizeof(arr)); /* garbage */ - memset(arr, 0, used_u32s*sizeof(*arr)); - arr[i/32] = (1U<<(i%32)); - - bitmap_zero(bmap1, 1024); - rv = bitmap_from_u32array(bmap1, nbits - 3, - arr, used_u32s); - expect_eq_uint(nbits - 3, rv); - - /* we are expecting the bit < nbits - - * 3 (none otherwise), and the rest of - * bmap1 unchanged (0-filled) - */ - bitmap_zero(bmap2, 1024); - if (i < nbits - 3) - __set_bit(i, bmap2); - expect_eq_bitmap(bmap2, 1024, bmap1, 1024); - - /* do the same with bmap1 initially - * 1-filled - */ - - bitmap_fill(bmap1, 1024); - rv = bitmap_from_u32array(bmap1, nbits - 3, - arr, used_u32s); - expect_eq_uint(nbits - 3, rv); - - /* we are expecting the bit < nbits - - * 3 (none otherwise), and the rest of - * bmap1 unchanged (1-filled) - */ - bitmap_zero(bmap2, 1024); - if (i < nbits - 3) - __set_bit(i, bmap2); - bitmap_set(bmap2, nbits-3, 1024 - nbits + 3); - expect_eq_bitmap(bmap2, 1024, bmap1, 1024); - } - } + unsigned int nbits, next_bit, len = sizeof(exp) * 8; + u32 arr[sizeof(exp) / 4]; + DECLARE_BITMAP(bmap2, len); + + memset(arr, 0xa5, sizeof(arr)); + + for (nbits = 0; nbits < len; ++nbits) { + bitmap_to_arr32(arr, exp, nbits); + bitmap_from_arr32(bmap2, arr, nbits); + expect_eq_bitmap(bmap2, exp, nbits); + + next_bit = find_next_bit(bmap2, + round_up(nbits, BITS_PER_LONG), nbits); + if (next_bit < round_up(nbits, BITS_PER_LONG)) + pr_err("bitmap_copy_arr32(nbits == %d:" + " tail is not safely cleared: %d\n", + nbits, next_bit); + + if (nbits < len - 32) + expect_eq_uint(arr[DIV_ROUND_UP(nbits, 32)], + 0xa5a5a5a5); } } @@ -453,8 +348,10 @@ static void noinline __init test_mem_optimisations(void) static int __init test_bitmap_init(void) { - test_zero_fill_copy(); - test_bitmap_u32_array_conversions(); + test_zero_clear(); + test_fill_set(); + test_copy(); + test_bitmap_arr32(); test_bitmap_parselist(); test_mem_optimisations(); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index ef1a3ac1397e..a808d81b409d 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -94,6 +94,37 @@ static noinline void __init kmalloc_pagealloc_oob_right(void) ptr[size] = 0; kfree(ptr); } + +static noinline void __init kmalloc_pagealloc_uaf(void) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + pr_info("kmalloc pagealloc allocation: use-after-free\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + kfree(ptr); + ptr[0] = 0; +} + +static noinline void __init kmalloc_pagealloc_invalid_free(void) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + pr_info("kmalloc pagealloc allocation: invalid-free\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + kfree(ptr + 1); +} #endif static noinline void __init kmalloc_large_oob_right(void) @@ -472,6 +503,74 @@ static noinline void __init use_after_scope_test(void) p[1023] = 1; } +static noinline void __init kasan_alloca_oob_left(void) +{ + volatile int i = 10; + char alloca_array[i]; + char *p = alloca_array - 1; + + pr_info("out-of-bounds to left on alloca\n"); + *(volatile char *)p; +} + +static noinline void __init kasan_alloca_oob_right(void) +{ + volatile int i = 10; + char alloca_array[i]; + char *p = alloca_array + i; + + pr_info("out-of-bounds to right on alloca\n"); + *(volatile char *)p; +} + +static noinline void __init kmem_cache_double_free(void) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + if (!cache) { + pr_err("Cache allocation failed\n"); + return; + } + pr_info("double-free on heap object\n"); + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + pr_err("Allocation failed\n"); + kmem_cache_destroy(cache); + return; + } + + kmem_cache_free(cache, p); + kmem_cache_free(cache, p); + kmem_cache_destroy(cache); +} + +static noinline void __init kmem_cache_invalid_free(void) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + if (!cache) { + pr_err("Cache allocation failed\n"); + return; + } + pr_info("invalid-free of heap object\n"); + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + pr_err("Allocation failed\n"); + kmem_cache_destroy(cache); + return; + } + + kmem_cache_free(cache, p + 1); + kmem_cache_destroy(cache); +} + static int __init kmalloc_tests_init(void) { /* @@ -485,6 +584,8 @@ static int __init kmalloc_tests_init(void) kmalloc_node_oob_right(); #ifdef CONFIG_SLUB kmalloc_pagealloc_oob_right(); + kmalloc_pagealloc_uaf(); + kmalloc_pagealloc_invalid_free(); #endif kmalloc_large_oob_right(); kmalloc_oob_krealloc_more(); @@ -502,9 +603,13 @@ static int __init kmalloc_tests_init(void) memcg_accounted_kmem_cache(); kasan_stack_oob(); kasan_global_oob(); + kasan_alloca_oob_left(); + kasan_alloca_oob_right(); ksize_unpoisons_memory(); copy_user_test(); use_after_scope_test(); + kmem_cache_double_free(); + kmem_cache_invalid_free(); kasan_restore_multi_shot(multishot); diff --git a/lib/test_sort.c b/lib/test_sort.c index d389c1cc2f6c..385c0ed5202f 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -39,5 +39,11 @@ exit: return err; } +static void __exit test_sort_exit(void) +{ +} + module_init(test_sort_init); +module_exit(test_sort_exit); + MODULE_LICENSE("GPL"); diff --git a/lib/ubsan.c b/lib/ubsan.c index fb0409df1bcf..59fee96c29a0 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -141,11 +141,6 @@ static void val_to_string(char *str, size_t size, struct type_descriptor *type, } } -static bool location_is_valid(struct source_location *loc) -{ - return loc->file_name != NULL; -} - static DEFINE_SPINLOCK(report_lock); static void ubsan_prologue(struct source_location *location, @@ -265,14 +260,14 @@ void __ubsan_handle_divrem_overflow(struct overflow_data *data, } EXPORT_SYMBOL(__ubsan_handle_divrem_overflow); -static void handle_null_ptr_deref(struct type_mismatch_data *data) +static void handle_null_ptr_deref(struct type_mismatch_data_common *data) { unsigned long flags; - if (suppress_report(&data->location)) + if (suppress_report(data->location)) return; - ubsan_prologue(&data->location, &flags); + ubsan_prologue(data->location, &flags); pr_err("%s null pointer of type %s\n", type_check_kinds[data->type_check_kind], @@ -281,15 +276,15 @@ static void handle_null_ptr_deref(struct type_mismatch_data *data) ubsan_epilogue(&flags); } -static void handle_missaligned_access(struct type_mismatch_data *data, +static void handle_misaligned_access(struct type_mismatch_data_common *data, unsigned long ptr) { unsigned long flags; - if (suppress_report(&data->location)) + if (suppress_report(data->location)) return; - ubsan_prologue(&data->location, &flags); + ubsan_prologue(data->location, &flags); pr_err("%s misaligned address %p for type %s\n", type_check_kinds[data->type_check_kind], @@ -299,15 +294,15 @@ static void handle_missaligned_access(struct type_mismatch_data *data, ubsan_epilogue(&flags); } -static void handle_object_size_mismatch(struct type_mismatch_data *data, +static void handle_object_size_mismatch(struct type_mismatch_data_common *data, unsigned long ptr) { unsigned long flags; - if (suppress_report(&data->location)) + if (suppress_report(data->location)) return; - ubsan_prologue(&data->location, &flags); + ubsan_prologue(data->location, &flags); pr_err("%s address %p with insufficient space\n", type_check_kinds[data->type_check_kind], (void *) ptr); @@ -315,37 +310,46 @@ static void handle_object_size_mismatch(struct type_mismatch_data *data, ubsan_epilogue(&flags); } -void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, +static void ubsan_type_mismatch_common(struct type_mismatch_data_common *data, unsigned long ptr) { if (!ptr) handle_null_ptr_deref(data); else if (data->alignment && !IS_ALIGNED(ptr, data->alignment)) - handle_missaligned_access(data, ptr); + handle_misaligned_access(data, ptr); else handle_object_size_mismatch(data, ptr); } -EXPORT_SYMBOL(__ubsan_handle_type_mismatch); -void __ubsan_handle_nonnull_return(struct nonnull_return_data *data) +void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, + unsigned long ptr) { - unsigned long flags; - - if (suppress_report(&data->location)) - return; - - ubsan_prologue(&data->location, &flags); + struct type_mismatch_data_common common_data = { + .location = &data->location, + .type = data->type, + .alignment = data->alignment, + .type_check_kind = data->type_check_kind + }; + + ubsan_type_mismatch_common(&common_data, ptr); +} +EXPORT_SYMBOL(__ubsan_handle_type_mismatch); - pr_err("null pointer returned from function declared to never return null\n"); +void __ubsan_handle_type_mismatch_v1(struct type_mismatch_data_v1 *data, + unsigned long ptr) +{ - if (location_is_valid(&data->attr_location)) - print_source_location("returns_nonnull attribute specified in", - &data->attr_location); + struct type_mismatch_data_common common_data = { + .location = &data->location, + .type = data->type, + .alignment = 1UL << data->log_alignment, + .type_check_kind = data->type_check_kind + }; - ubsan_epilogue(&flags); + ubsan_type_mismatch_common(&common_data, ptr); } -EXPORT_SYMBOL(__ubsan_handle_nonnull_return); +EXPORT_SYMBOL(__ubsan_handle_type_mismatch_v1); void __ubsan_handle_vla_bound_not_positive(struct vla_bound_data *data, unsigned long bound) diff --git a/lib/ubsan.h b/lib/ubsan.h index 88f23557edbe..f4d8d0bd4016 100644 --- a/lib/ubsan.h +++ b/lib/ubsan.h @@ -37,15 +37,24 @@ struct type_mismatch_data { unsigned char type_check_kind; }; -struct nonnull_arg_data { +struct type_mismatch_data_v1 { struct source_location location; - struct source_location attr_location; - int arg_index; + struct type_descriptor *type; + unsigned char log_alignment; + unsigned char type_check_kind; +}; + +struct type_mismatch_data_common { + struct source_location *location; + struct type_descriptor *type; + unsigned long alignment; + unsigned char type_check_kind; }; -struct nonnull_return_data { +struct nonnull_arg_data { struct source_location location; struct source_location attr_location; + int arg_index; }; struct vla_bound_data { diff --git a/mm/Kconfig b/mm/Kconfig index 03ff7703d322..c782e8fb7235 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB A sane initial value is 80 MB. -# For architectures that support deferred memory initialisation -config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - bool - config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - depends on NO_BOOTMEM && MEMORY_HOTPLUG + depends on NO_BOOTMEM depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b5f940ce0143..ffa0c6b9e78a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -231,11 +231,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/bootmem.c b/mm/bootmem.c index 6aef64254203..9e197987b67d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -410,7 +410,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting physical address of the range + * @physaddr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -38,6 +38,7 @@ #include <trace/events/cma.h> #include "cma.h" +#include "internal.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; @@ -108,23 +109,25 @@ static int __init cma_activate_area(struct cma *cma) if (!cma->bitmap) return -ENOMEM; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { unsigned j; base_pfn = pfn; + if (!pfn_valid(base_pfn)) + goto err; + + zone = page_zone(pfn_to_page(base_pfn)); for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); + if (!pfn_valid(pfn)) + goto err; + /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. + * In init_cma_reserved_pageblock(), present_pages + * is adjusted with assumption that all pages in + * the pageblock come from a single zone. */ if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -138,7 +141,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; -not_in_zone: +err: pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; @@ -148,6 +151,41 @@ not_in_zone: static int __init cma_init_reserved_areas(void) { int i; + struct zone *zone; + pg_data_t *pgdat; + + if (!cma_area_count) + return 0; + + for_each_online_pgdat(pgdat) { + unsigned long start_pfn = UINT_MAX, end_pfn = 0; + + zone = &pgdat->node_zones[ZONE_MOVABLE]; + + /* + * In this case, we cannot adjust the zone range + * since it is now maximum node span and we don't + * know original zone range. + */ + if (populated_zone(zone)) + continue; + + for (i = 0; i < cma_area_count; i++) { + if (pfn_to_nid(cma_areas[i].base_pfn) != + pgdat->node_id) + continue; + + start_pfn = min(start_pfn, cma_areas[i].base_pfn); + end_pfn = max(end_pfn, cma_areas[i].base_pfn + + cma_areas[i].count); + } + + if (!end_pfn) + continue; + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } for (i = 0; i < cma_area_count; i++) { int ret = cma_activate_area(&cma_areas[i]); @@ -156,9 +194,32 @@ static int __init cma_init_reserved_areas(void) return ret; } + /* + * Reserved pages for ZONE_MOVABLE are now activated and + * this would change ZONE_MOVABLE's managed page counter and + * the other zones' present counter. We need to re-calculate + * various zone information that depends on this initialization. + */ + build_all_zonelists(NULL); + for_each_populated_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + zone_pcp_reset(zone); + setup_zone_pageset(zone); + } else + zone_pcp_update(zone); + + set_zone_contiguous(zone); + } + + /* + * We need to re-init per zone wmark by calling + * init_per_zone_wmark_min() but doesn't call here because it is + * registered on core_initcall and it will be called later than us. + */ + return 0; } -core_initcall(cma_init_reserved_areas); +pure_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory diff --git a/mm/compaction.c b/mm/compaction.c index 10cd757f1006..0e569e68b022 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1165,8 +1165,7 @@ static void isolate_freepages(struct compact_control *cc) * from the isolated freelists in the block we are migrating to. */ static struct page *compaction_alloc(struct page *migratepage, - unsigned long data, - int **result) + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; @@ -1450,14 +1449,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - ALLOC_CMA, wmark_target)) + 0, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; @@ -1738,7 +1735,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation - * @mode: The migration mode for async, sync light, or sync migration + * @prio: Determines how hard direct compaction should try to succeed * * This is the main entry point for direct page compaction. */ diff --git a/mm/fadvise.c b/mm/fadvise.c index ec70d6e4b86d..767887f5f3bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); - if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) { + /* + * The page at end_index will be inclusively discarded according + * by invalidate_mapping_pages(), so subtracting 1 from + * end_index means we will skip the last page. But if endbyte + * is page aligned or is at the end of file, we should not skip + * that page - discarding the last page is safe enough. + */ + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK && + endbyte != inode->i_size - 1) { /* First page is tricky as 0 - 1 = -1, but pgoff_t * is unsigned, so the end_index >= start_index * check below would be true and we'll discard the whole diff --git a/mm/filemap.c b/mm/filemap.c index ee83baaf855d..693f62212a59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,6 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/cpuset.h> -#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> @@ -418,7 +418,7 @@ again: } if (!pte_present(pte)) { - swp_entry_t entry; + swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { if (hmm_vma_walk->fault) @@ -426,8 +426,6 @@ again: continue; } - entry = pte_to_swp_entry(pte); - /* * This is a special swap entry, ignore migration, use * device and report anything else as error. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7ded98d114..13441808eb66 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. */ - entry = *pmd; - pmdp_invalidate(vma, addr, pmd); - - /* - * Recover dirty/young flags. It relies on pmdp_invalidate to not - * corrupt them. - */ - if (pmd_dirty(*pmd)) - entry = pmd_mkdirty(entry); - if (pmd_young(*pmd)) - entry = pmd_mkyoung(entry); + entry = pmdp_invalidate(vma, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) @@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t _pmd; - bool young, write, dirty, soft_dirty, pmd_migration = false; + pmd_t old_pmd, _pmd; + bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - pmd_migration = is_pmd_migration_entry(*pmd); + pmd_migration = is_pmd_migration_entry(old_pmd); if (pmd_migration) { swp_entry_t entry; - entry = pmd_to_swp_entry(*pmd); + entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); } else #endif - page = pmd_page(*pmd); + page = pmd_page(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - write = pmd_write(*pmd); - young = pmd_young(*pmd); - dirty = pmd_dirty(*pmd); - soft_dirty = pmd_soft_dirty(*pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); - pmdp_huge_split_prepare(vma, haddr, pmd); + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_mksoft_dirty(entry); } - if (dirty) - SetPageDirty(page + i); pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); @@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and userland has the - * whole access to the hugepage during the split (which happens in - * place). If we overwrite the pmd with the not-huge version pointing - * to the pte here (which of course we could if all CPUs were bug - * free), userland could trigger a small page size TLB miss on the - * small sized TLB while the hugepage TLB entry is still established in - * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum - * 383 on page 93. Intel should be safe but is also warns that it's - * only safe if the permission and cache attributes of the two entries - * loaded in the two TLB is identical (which should be the case here). - * But it is generally safer to never allow small and huge TLB entries - * for the same virtual address to be loaded simultaneously. So instead - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the - * current pmd notpresent (atomically because here the pmd_trans_huge - * and pmd_trans_splitting must remain set at all times on the pmd - * until the split is complete for this pmd), then we flush the SMP TLB - * and finally we write the non-huge version of the pmd entry with - * pmd_populate. - */ - pmdp_invalidate(vma, haddr, pmd); pmd_populate(mm, pmd, pgtable); if (freeze) { @@ -2415,6 +2407,12 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because some iterators expect new + * pages to show after the currently processed elements - e.g. + * migrate_pages + */ lru_add_page_tail(head, page_tail, lruvec, list); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a334f5fb730..7c204e3d132b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,10 +34,9 @@ #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/userfaultfd_k.h> +#include <linux/page_owner.h> #include "internal.h" -int hugepages_treat_as_movable; - int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -926,7 +925,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + if (hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, struct hstate *h) +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; @@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - gfp_t gfp_mask; - gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { spin_lock_irqsave(&zone->lock, flags); pfn = ALIGN(zone->zone_start_pfn, nr_pages); @@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) -{ - struct page *page; - - page = alloc_gigantic_page(nid, h); - if (page) { - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, nid); - } - - return page; -} - -static int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) -{ - struct page *page = NULL; - int nr_nodes, node; - - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_gigantic_page_node(h, node); - if (page) - return 1; - } - - return 0; -} - #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } -static inline int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) { return 0; } #endif static void update_and_free_page(struct hstate *h, struct page *page) @@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page) ClearPagePrivate(&page[1]); } +/* + * Internal hugetlb specific page flag. Do not use outside of the hugetlb + * code + */ +static inline bool PageHugeTemporary(struct page *page) +{ + if (!PageHuge(page)) + return false; + + return (unsigned long)page[2].mapping == -1U; +} + +static inline void SetPageHugeTemporary(struct page *page) +{ + page[2].mapping = (void *)-1U; +} + +static inline void ClearPageHugeTemporary(struct page *page) +{ + page[2].mapping = NULL; +} + void free_huge_page(struct page *page) { /* @@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid]) { + if (PageHugeTemporary(page)) { + list_del(&page->lru); + ClearPageHugeTemporary(page); + update_and_free_page(h, page); + } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) +static struct page *alloc_buddy_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { + int order = huge_page_order(h); struct page *page; - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_RETRY_MAYFAIL|__GFP_NOWARN, - huge_page_order(h)); - if (page) { - prep_new_huge_page(h, page, nid); - } + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + if (page) + __count_vm_event(HTLB_BUDDY_PGALLOC); + else + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return page; +} + +/* + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + */ +static struct page *alloc_fresh_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + else + page = alloc_buddy_huge_page(h, gfp_mask, + nid, nmask); + if (!page) + return NULL; + + if (hstate_is_gigantic(h)) + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, page_to_nid(page)); return page; } -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +/* + * Allocates a fresh page to the hugetlb allocator pool in the node interleaved + * manner. + */ +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int nr_nodes, node; - int ret = 0; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page_node(h, node); - if (page) { - ret = 1; + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); + if (page) break; - } } - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + if (!page) + return 0; - return ret; + put_page(page); /* free it into the hugepage allocator */ + + return 1; } /* @@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) -{ - int order = huge_page_order(h); - - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); -} - -static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, +/* + * Allocates a fresh surplus page from the page allocator. + */ +static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; - unsigned int r_nid; + struct page *page = NULL; if (hstate_is_gigantic(h)) return NULL; + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) + goto out_unlock; + spin_unlock(&hugetlb_lock); + + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + if (!page) + return NULL; + + spin_lock(&hugetlb_lock); /* - * Assume we will successfully allocate the surplus page to - * prevent racing processes from causing the surplus to exceed - * overcommit - * - * This however introduces a different race, where a process B - * tries to grow the static hugepage pool while alloc_pages() is - * called by process A. B will only examine the per-node - * counters in determining if surplus huge pages can be - * converted to normal huge pages in adjust_pool_surplus(). A - * won't be able to increment the per-node counter, until the - * lock is dropped by B, but B doesn't drop hugetlb_lock until - * no more huge pages can be converted from surplus to normal - * state (and doesn't try to convert again). Thus, we have a - * case where a surplus huge page exists, the pool is grown, and - * the surplus huge page still exists after, even though it - * should just have been converted to a normal huge page. This - * does not leak memory, though, as the hugepage will be freed - * once it is out of use. It also does not allow the counters to - * go out of whack in adjust_pool_surplus() as we don't modify - * the node values until we've gotten the hugepage and only the - * per-node value is checked there. + * We could have raced with the pool size change. + * Double check that and simply deallocate the new page + * if we would end up overcommiting the surpluses. Abuse + * temporary page to workaround the nasty free_huge_page + * codeflow */ - spin_lock(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - spin_unlock(&hugetlb_lock); - return NULL; + SetPageHugeTemporary(page); + put_page(page); + page = NULL; } else { - h->nr_huge_pages++; h->surplus_huge_pages++; + h->nr_huge_pages_node[page_to_nid(page)]++; } + +out_unlock: spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + return page; +} - spin_lock(&hugetlb_lock); - if (page) { - INIT_LIST_HEAD(&page->lru); - r_nid = page_to_nid(page); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_hugetlb_cgroup(page, NULL); - /* - * We incremented the global counters already - */ - h->nr_huge_pages_node[r_nid]++; - h->surplus_huge_pages_node[r_nid]++; - __count_vm_event(HTLB_BUDDY_PGALLOC); - } else { - h->nr_huge_pages--; - h->surplus_huge_pages--; - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - } - spin_unlock(&hugetlb_lock); +static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + return NULL; + + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + if (!page) + return NULL; + + /* + * We do not account these pages as surplus because they are only + * temporary and will be released properly on the last reference + */ + SetPageHugeTemporary(page); return page; } @@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct page *page; @@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } -/* - * This allocation function is useful in the context where vma is irrelevant. - * E.g. soft-offlining uses this function because it only cares physical - * address of error page. - */ +/* page migration callback function */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { gfp_t gfp_mask = htlb_alloc_mask(h); @@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); + page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); return page; } - +/* page migration callback function */ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) { @@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } spin_unlock(&hugetlb_lock); - /* No reservations, try to overcommit */ + return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); +} + +/* mempolicy aware migration callback */ +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct page *page; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + page = alloc_huge_page_nodemask(h, node, nodemask); + mpol_cond_put(mpol); - return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); + return page; } /* @@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; @@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); + page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { @@ -2074,20 +2097,6 @@ out_subpool_put: return ERR_PTR(-ENOSPC); } -/* - * alloc_huge_page()'s wrapper which simply returns the page if allocation - * succeeds, otherwise NULL. This function is called from new_vma_page(), - * where no ERR_VALUE is expected to be returned. - */ -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) -{ - struct page *page = alloc_huge_page(vma, addr, avoid_reserve); - if (IS_ERR(page)) - page = NULL; - return page; -} - int alloc_bootmem_huge_page(struct hstate *h) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h) @@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void) prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* free it into the hugepage allocator */ + /* * If we had gigantic hugepages allocated at boot time, we need * to restore the 'stolen' pages to totalram_pages in order to @@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h, + } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY])) break; cond_resched(); @@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with __alloc_buddy_huge_page() here and be unable + * We might race with alloc_surplus_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, /* yield cpu to avoid soft lockup */ cond_resched(); - if (hstate_is_gigantic(h)) - ret = alloc_fresh_gigantic_page(h, nodes_allowed); - else - ret = alloc_fresh_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * __alloc_buddy_huge_page() is checking the global counter, + * alloc_surplus_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -2975,20 +2983,32 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { - struct hstate *h = &default_hstate; + struct hstate *h; + unsigned long total = 0; + if (!hugepages_supported()) return; - seq_printf(m, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %8lu kB\n", - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages, - h->surplus_huge_pages, - 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); + + for_each_hstate(h) { + unsigned long count = h->nr_huge_pages; + + total += (PAGE_SIZE << huge_page_order(h)) * count; + + if (h == &default_hstate) + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + count, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + (PAGE_SIZE << huge_page_order(h)) / 1024); + } + + seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); } int hugetlb_report_node_meminfo(int nid, char *buf) @@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } + +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +{ + struct hstate *h = page_hstate(oldpage); + + hugetlb_cgroup_migrate(oldpage, newpage); + set_page_owner_migrate_reason(newpage, reason); + + /* + * transfer temporary state of the new huge page. This is + * reverse to other transitions because the newpage is going to + * be final while the old one will be freed so it takes over + * the temporary status. + * + * Also note that we have to transfer the per-node surplus state + * here as well otherwise the global surplus count will not match + * the per-node's. + */ + if (PageHugeTemporary(newpage)) { + int old_nid = page_to_nid(oldpage); + int new_nid = page_to_nid(newpage); + + SetPageHugeTemporary(oldpage); + ClearPageHugeTemporary(newpage); + + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages_node[old_nid]) { + h->surplus_huge_pages_node[old_nid]--; + h->surplus_huge_pages_node[new_nid]++; + } + spin_unlock(&hugetlb_lock); + } +} diff --git a/mm/internal.h b/mm/internal.h index e6bd35182dae..62d8c34e63d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; @@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node); #endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c index b47664358796..27ddfd29112a 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) { - return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; + return v->vm_pgoff + vma_pages(v) - 1; } INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 405bba487df5..e13d911251e7 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -5,7 +5,7 @@ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by - * Andrey Konovalov <adech.fo@gmail.com> + * Andrey Konovalov <andreyknvl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -489,21 +489,17 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) -{ - unsigned long size = cache->object_size; - unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - - /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return; - - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); -} - -bool kasan_slab_free(struct kmem_cache *cache, void *object) +static bool __kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool quarantine) { s8 shadow_byte; + unsigned long rounded_up_size; + + if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != + object)) { + kasan_report_invalid_free(object, ip); + return true; + } /* RCU slabs could be legally used after free within the RCU period */ if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) @@ -511,14 +507,14 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, - __builtin_return_address(1)); + kasan_report_invalid_free(object, ip); return true; } - kasan_poison_slab_free(cache, object); + rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); - if (unlikely(!(cache->flags & SLAB_KASAN))) + if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN))) return false; set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); @@ -526,6 +522,11 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) return true; } +bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +{ + return __kasan_slab_free(cache, object, ip, true); +} + void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags) { @@ -589,25 +590,29 @@ void kasan_krealloc(const void *object, size_t size, gfp_t flags) kasan_kmalloc(page->slab_cache, object, size, flags); } -void kasan_poison_kfree(void *ptr) +void kasan_poison_kfree(void *ptr, unsigned long ip) { struct page *page; page = virt_to_head_page(ptr); - if (unlikely(!PageSlab(page))) + if (unlikely(!PageSlab(page))) { + if (ptr != page_address(page)) { + kasan_report_invalid_free(ptr, ip); + return; + } kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), KASAN_FREE_PAGE); - else - kasan_poison_slab_free(page->slab_cache, ptr); + } else { + __kasan_slab_free(page->slab_cache, ptr, ip, false); + } } -void kasan_kfree_large(const void *ptr) +void kasan_kfree_large(void *ptr, unsigned long ip) { - struct page *page = virt_to_page(ptr); - - kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), - KASAN_FREE_PAGE); + if (ptr != page_address(virt_to_head_page(ptr))) + kasan_report_invalid_free(ptr, ip); + /* The object will be poisoned by page_alloc. */ } int kasan_module_alloc(void *addr, size_t size) @@ -736,6 +741,55 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size) } EXPORT_SYMBOL(__asan_unpoison_stack_memory); +/* Emitted by compiler to poison alloca()ed objects. */ +void __asan_alloca_poison(unsigned long addr, size_t size) +{ + size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - + rounded_up_size; + size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); + + const void *left_redzone = (const void *)(addr - + KASAN_ALLOCA_REDZONE_SIZE); + const void *right_redzone = (const void *)(addr + rounded_up_size); + + WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + + kasan_unpoison_shadow((const void *)(addr + rounded_down_size), + size - rounded_down_size); + kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + kasan_poison_shadow(right_redzone, + padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); +} +EXPORT_SYMBOL(__asan_alloca_poison); + +/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +{ + if (unlikely(!stack_top || stack_top > stack_bottom)) + return; + + kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); +} +EXPORT_SYMBOL(__asan_allocas_unpoison); + +/* Emitted by the compiler to [un]poison local variables. */ +#define DEFINE_ASAN_SET_SHADOW(byte) \ + void __asan_set_shadow_##byte(const void *addr, size_t size) \ + { \ + __memset((void *)addr, 0x##byte, size); \ + } \ + EXPORT_SYMBOL(__asan_set_shadow_##byte) + +DEFINE_ASAN_SET_SHADOW(00); +DEFINE_ASAN_SET_SHADOW(f1); +DEFINE_ASAN_SET_SHADOW(f2); +DEFINE_ASAN_SET_SHADOW(f3); +DEFINE_ASAN_SET_SHADOW(f5); +DEFINE_ASAN_SET_SHADOW(f8); + #ifdef CONFIG_MEMORY_HOTPLUG static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c70851a9a6a4..c12dcfde2ebd 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -24,6 +24,14 @@ #define KASAN_STACK_PARTIAL 0xF4 #define KASAN_USE_AFTER_SCOPE 0xF8 +/* + * alloca redzone shadow values + */ +#define KASAN_ALLOCA_LEFT 0xCA +#define KASAN_ALLOCA_RIGHT 0xCB + +#define KASAN_ALLOCA_REDZONE_SIZE 32 + /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION #define KASAN_ABI_VERSION 1 @@ -99,8 +107,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -void kasan_report_double_free(struct kmem_cache *cache, void *object, - void *ip); +void kasan_report_invalid_free(void *object, unsigned long ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); @@ -113,4 +120,48 @@ static inline void quarantine_reduce(void) { } static inline void quarantine_remove_cache(struct kmem_cache *cache) { } #endif +/* + * Exported functions for interfaces called from assembly or from generated + * code. Declarations here to avoid warning about missing declarations. + */ +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); +void __asan_register_globals(struct kasan_global *globals, size_t size); +void __asan_unregister_globals(struct kasan_global *globals, size_t size); +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); +void __asan_handle_no_return(void); +void __asan_poison_stack_memory(const void *addr, size_t size); +void __asan_unpoison_stack_memory(const void *addr, size_t size); +void __asan_alloca_poison(unsigned long addr, size_t size); +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); + +void __asan_load1(unsigned long addr); +void __asan_store1(unsigned long addr); +void __asan_load2(unsigned long addr); +void __asan_store2(unsigned long addr); +void __asan_load4(unsigned long addr); +void __asan_store4(unsigned long addr); +void __asan_load8(unsigned long addr); +void __asan_store8(unsigned long addr); +void __asan_load16(unsigned long addr); +void __asan_store16(unsigned long addr); + +void __asan_load1_noabort(unsigned long addr); +void __asan_store1_noabort(unsigned long addr); +void __asan_load2_noabort(unsigned long addr); +void __asan_store2_noabort(unsigned long addr); +void __asan_load4_noabort(unsigned long addr); +void __asan_store4_noabort(unsigned long addr); +void __asan_load8_noabort(unsigned long addr); +void __asan_store8_noabort(unsigned long addr); +void __asan_load16_noabort(unsigned long addr); +void __asan_store16_noabort(unsigned long addr); + +void __asan_set_shadow_00(const void *addr, size_t size); +void __asan_set_shadow_f1(const void *addr, size_t size); +void __asan_set_shadow_f2(const void *addr, size_t size); +void __asan_set_shadow_f3(const void *addr, size_t size); +void __asan_set_shadow_f5(const void *addr, size_t size); +void __asan_set_shadow_f8(const void *addr, size_t size); + #endif diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 410c8235e671..5c169aa688fd 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -5,7 +5,7 @@ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by - * Andrey Konovalov <adech.fo@gmail.com> + * Andrey Konovalov <andreyknvl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -102,6 +102,10 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) case KASAN_USE_AFTER_SCOPE: bug_type = "use-after-scope"; break; + case KASAN_ALLOCA_LEFT: + case KASAN_ALLOCA_RIGHT: + bug_type = "alloca-out-of-bounds"; + break; } return bug_type; @@ -322,13 +326,12 @@ static void print_shadow_for_address(const void *addr) } } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - void *ip) +void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; kasan_start_report(&flags); - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); pr_err("\n"); print_address_description(object); pr_err("\n"); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ea4ff259b671..b7e2268dfc9a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_range(mapping, index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, 1, false); spin_lock_irq(&mapping->tree_lock); @@ -1674,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, spin_unlock(&khugepaged_mm_lock); mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else + /* + * Don't wait for semaphore (to avoid long wait times). Just move to + * the next mm on the list. + */ + vma = NULL; + if (unlikely(!down_read_trylock(&mm->mmap_sem))) + goto breakouterloop_mmap_sem; + if (likely(!khugepaged_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f656ca27f6c2..e83987c55a08 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -91,7 +91,6 @@ #include <linux/stacktrace.h> #include <linux/cache.h> #include <linux/percpu.h> -#include <linux/hardirq.h> #include <linux/bootmem.h> #include <linux/pfn.h> #include <linux/mmzone.h> @@ -2302,7 +2302,7 @@ next_mm: /** * ksm_do_scan - the ksm scanner main worker function. - * @scan_npages - number of pages we want to scan before we return. + * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { diff --git a/mm/list_lru.c b/mm/list_lru.c index fd41e969ede5..d9c84c5bda1d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> +#include <linux/prefetch.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> @@ -52,14 +53,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru) static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { + struct list_lru_memcg *memcg_lrus; /* - * The lock protects the array of per cgroup lists from relocation - * (see memcg_update_list_lru_node). + * Either lock or RCU protects the array of per cgroup lists + * from relocation (see memcg_update_list_lru_node). */ - lockdep_assert_held(&nlru->lock); - if (nlru->memcg_lrus && idx >= 0) - return nlru->memcg_lrus->lru[idx]; - + memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, + lockdep_is_held(&nlru->lock)); + if (memcg_lrus && idx >= 0) + return memcg_lrus->lru[idx]; return &nlru->lru; } @@ -132,6 +134,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; + /* + * Prefetch the neighboring list entries to reduce lock hold time. + */ + prefetchw(item->prev); + prefetchw(item->next); + spin_lock(&nlru->lock); if (!list_empty(item)) { l = list_lru_from_kmem(nlru, item); @@ -168,10 +176,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, struct list_lru_one *l; unsigned long count; - spin_lock(&nlru->lock); + rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_idx); count = l->nr_items; - spin_unlock(&nlru->lock); + rcu_read_unlock(); return count; } @@ -324,24 +332,41 @@ fail: static int memcg_init_list_lru_node(struct list_lru_node *nlru) { + struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); - if (!nlru->memcg_lrus) + memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + + size * sizeof(void *), GFP_KERNEL); + if (!memcg_lrus) return -ENOMEM; - if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { - kvfree(nlru->memcg_lrus); + if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { + kvfree(memcg_lrus); return -ENOMEM; } + RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus); return 0; } static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) { - __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); - kvfree(nlru->memcg_lrus); + struct list_lru_memcg *memcg_lrus; + /* + * This is called when shrinker has already been unregistered, + * and nobody can use it. So, there is no need to use kvfree_rcu(). + */ + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); + __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); + kvfree(memcg_lrus); +} + +static void kvfree_rcu(struct rcu_head *head) +{ + struct list_lru_memcg *mlru; + + mlru = container_of(head, struct list_lru_memcg, rcu); + kvfree(mlru); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -351,8 +376,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, BUG_ON(old_size > new_size); - old = nlru->memcg_lrus; - new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); + old = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); + new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; @@ -361,29 +387,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(new, old, old_size * sizeof(void *)); + memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); /* - * The lock guarantees that we won't race with a reader - * (see list_lru_from_memcg_idx). + * The locking below allows readers that hold nlru->lock avoid taking + * rcu_read_lock (see list_lru_from_memcg_idx). * * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. */ spin_lock_irq(&nlru->lock); - nlru->memcg_lrus = new; + rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - kvfree(old); + call_rcu(&old->rcu, kvfree_rcu); return 0; } static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, int old_size, int new_size) { + struct list_lru_memcg *memcg_lrus; + + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); /* do not bother shrinking the array back to the old size, because we * cannot handle allocation failures here */ - __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); + __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size); } static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) diff --git a/mm/maccess.c b/mm/maccess.c index 78f9274dd49d..ec00be51a24f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_write); * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. - * @src: Unsafe address. + * @unsafe_addr: Unsafe address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe address to kernel buffer. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9011997d8a5c..88c1af32fd67 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -/* - * Return page count for single (non recursive) @memcg. - * - * Implementation Note: reading percpu statistics for memcg. - * - * Both of vmstat[] and percpu_counter has threshold and do periodic - * synchronization to implement "quick" read. There are trade-off between - * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronization of counter in memcg's counter. - * - * But this _read() function is used for user interface now. The user accounts - * memory usage by memory cgroup and he _always_ requires exact value because - * he accounts memory. Even if we provide quick-and-fuzzy read, we always - * have to visit all online cpus and make sum. So, for now, unnecessary - * synchronization is not implemented. (just implemented for cpu hotplug) - * - * If there are kernel internal actions which can make use of some not-exact - * value, and reading all cpu value can be performance bottleneck in some - * common workload, threshold and synchronization as vmstat[] should be - * implemented. - * - * The parameter idx can be of type enum memcg_event_item or vm_event_item. - */ - static unsigned long memcg_sum_events(struct mem_cgroup *memcg, int event) { - unsigned long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[event], cpu); - return val; + return atomic_long_read(&memcg->events[event]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -586,27 +557,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); else { - __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); + __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); if (PageSwapBacked(page)) - __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); + __mod_memcg_state(memcg, NR_SHMEM, nr_pages); } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[PGPGIN]); + __count_memcg_events(memcg, PGPGIN, 1); else { - __this_cpu_inc(memcg->stat->events[PGPGOUT]); + __count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); + __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, @@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->nr_page_events); - next = __this_cpu_read(memcg->stat->targets[target]); + val = __this_cpu_read(memcg->stat_cpu->nr_page_events); + next = __this_cpu_read(memcg->stat_cpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat->targets[target], next); + __this_cpu_write(memcg->stat_cpu->targets[target], next); return true; } return false; @@ -917,7 +888,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * value, the function breaks the iteration loop and returns the value. * Otherwise, it will iterate over all tasks and return 0. * - * This function must not be called for the root memory cgroup. + * If memcg is the root memory cgroup, this function will iterate only + * over tasks belonging directly to the root memory cgroup. */ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) @@ -925,8 +897,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); - for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -935,7 +905,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); - if (ret) { + if (ret || memcg == root_mem_cgroup) { mem_cgroup_iter_break(memcg, iter); break; } @@ -946,7 +916,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page - * @zone: zone of the page + * @pgdat: pgdat of the page * * This function is only safe when following the LRU page isolation * and putback protocol: the LRU lock must be held, and the page must @@ -1124,7 +1094,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -unsigned int memcg1_stats[] = { +static const unsigned int memcg1_stats[] = { MEMCG_CACHE, MEMCG_RSS, MEMCG_RSS_HUGE, @@ -1206,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) } /* - * This function returns the number of memcg under hierarchy tree. Returns - * 1(self count) if no children. - */ -static int mem_cgroup_count_children(struct mem_cgroup *memcg) -{ - int num = 0; - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - num++; - return num; -} - -/* * Return the memory (and swap, if configured) limit for a memcg. */ unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) @@ -1707,11 +1663,6 @@ void unlock_page_memcg(struct page *page) } EXPORT_SYMBOL(unlock_page_memcg); -/* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. - */ -#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1739,7 +1690,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) unsigned long flags; bool ret = false; - if (nr_pages > CHARGE_BATCH) + if (nr_pages > MEMCG_CHARGE_BATCH) return ret; local_irq_save(flags); @@ -1808,7 +1759,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; - if (stock->nr_pages > CHARGE_BATCH) + if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); local_irq_restore(flags); @@ -1858,9 +1809,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); + + for_each_mem_cgroup(memcg) { + int i; + + for (i = 0; i < MEMCG_NR_STAT; i++) { + int nid; + long x; + + x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &memcg->stat[i]); + + if (i >= NR_VM_NODE_STAT_ITEMS) + continue; + + for_each_node(nid) { + struct mem_cgroup_per_node *pn; + + pn = mem_cgroup_nodeinfo(memcg, nid); + x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &pn->lruvec_stat[i]); + } + } + + for (i = 0; i < MEMCG_NR_EVENTS; i++) { + long x; + + x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); + if (x) + atomic_long_add(x, &memcg->events[i]); + } + } + return 0; } @@ -1881,7 +1867,7 @@ static void high_work_func(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); } /* @@ -1905,7 +1891,7 @@ void mem_cgroup_handle_over_high(void) static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { - unsigned int batch = max(CHARGE_BATCH, nr_pages); + unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; @@ -2415,18 +2401,11 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], - HPAGE_PMD_NR); + __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - int nr_entries) -{ - this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); -} - /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -2450,8 +2429,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, -1); - mem_cgroup_swap_statistics(to, 1); + mod_memcg_state(from, MEMCG_SWAP, -1); + mod_memcg_state(to, MEMCG_SWAP, 1); return 0; } return -EINVAL; @@ -2467,23 +2446,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit) + unsigned long limit, bool memsw) { - unsigned long curusage; - unsigned long oldusage; bool enlarge = false; - int retry_count; int ret; - - /* - * For keeping hierarchical_reclaim simple, how long we should retry - * is depends on callers. We set our retry-count to be function - * of # of children which we should visit in this loop. - */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(&memcg->memory); + bool limits_invariant; + struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; do { if (signal_pending(current)) { @@ -2492,79 +2460,31 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } mutex_lock(&memcg_limit_mutex); - if (limit > memcg->memsw.limit) { + /* + * Make sure that the new limit (memsw or memory limit) doesn't + * break our basic invariant rule memory.limit <= memsw.limit. + */ + limits_invariant = memsw ? limit >= memcg->memory.limit : + limit <= memcg->memsw.limit; + if (!limits_invariant) { mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; break; } - if (limit > memcg->memory.limit) + if (limit > counter->limit) enlarge = true; - ret = page_counter_limit(&memcg->memory, limit); + ret = page_counter_limit(counter, limit); mutex_unlock(&memcg_limit_mutex); if (!ret) break; - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); - - curusage = page_counter_read(&memcg->memory); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); - - if (!ret && enlarge) - memcg_oom_recover(memcg); - - return ret; -} - -static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - unsigned long curusage; - unsigned long oldusage; - bool enlarge = false; - int retry_count; - int ret; - - /* see mem_cgroup_resize_res_limit */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(&memcg->memsw); - - do { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - mutex_lock(&memcg_limit_mutex); - if (limit < memcg->memory.limit) { - mutex_unlock(&memcg_limit_mutex); - ret = -EINVAL; + if (!try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, !memsw)) { + ret = -EBUSY; break; } - if (limit > memcg->memsw.limit) - enlarge = true; - ret = page_counter_limit(&memcg->memsw, limit); - mutex_unlock(&memcg_limit_mutex); - - if (!ret) - break; - - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); - - curusage = page_counter_read(&memcg->memsw); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); + } while (true); if (!ret && enlarge) memcg_oom_recover(memcg); @@ -2671,6 +2591,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) return ret; } +static long memcg_oom_badness(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + long points = 0; + int nid; + pg_data_t *pgdat; + + for_each_node_state(nid, N_MEMORY) { + if (nodemask && !node_isset(nid, *nodemask)) + continue; + + points += mem_cgroup_node_nr_lru_pages(memcg, nid, + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); + + pgdat = NODE_DATA(nid); + points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg), + NR_SLAB_UNRECLAIMABLE); + } + + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / + (PAGE_SIZE / 1024); + points += memcg_page_state(memcg, MEMCG_SOCK); + points += memcg_page_state(memcg, MEMCG_SWAP); + + return points; +} + +/* + * Checks if the given memcg is a valid OOM victim and returns a number, + * which means the folowing: + * -1: there are inflight OOM victim tasks, belonging to the memcg + * 0: memcg is not eligible, e.g. all belonging tasks are protected + * by oom_score_adj set to OOM_SCORE_ADJ_MIN + * >0: memcg is eligible, and the returned value is an estimation + * of the memory footprint + */ +static long oom_evaluate_memcg(struct mem_cgroup *memcg, + const nodemask_t *nodemask, + unsigned long totalpages) +{ + struct css_task_iter it; + struct task_struct *task; + int eligible = 0; + + /* + * Root memory cgroup is a special case: + * we don't have necessary stats to evaluate it exactly as + * leaf memory cgroups, so we approximate it's oom_score + * by summing oom_score of all belonging tasks, which are + * owners of their mm structs. + * + * If there are inflight OOM victim tasks inside + * the root memcg, we return -1. + */ + if (memcg == root_mem_cgroup) { + struct css_task_iter it; + struct task_struct *task; + long score = 0; + + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, + &task->signal->oom_mm->flags)) { + score = -1; + break; + } + + task_lock(task); + if (!task->mm || task->mm->owner != task) { + task_unlock(task); + continue; + } + task_unlock(task); + + score += oom_badness(task, memcg, nodemask, + totalpages); + } + css_task_iter_end(&it); + + return score; + } + + /* + * Memcg is OOM eligible if there are OOM killable tasks inside. + * + * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN + * as unkillable. + * + * If there are inflight OOM victim tasks inside the memcg, + * we return -1. + */ + css_task_iter_start(&memcg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (!eligible && + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) + eligible = 1; + + if (tsk_is_oom_victim(task) && + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { + eligible = -1; + break; + } + } + css_task_iter_end(&it); + + if (eligible <= 0) + return eligible; + + return memcg_oom_badness(memcg, nodemask, totalpages); +} + +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) +{ + struct mem_cgroup *iter, *group = NULL; + long group_score = 0; + + oc->chosen_memcg = NULL; + oc->chosen_points = 0; + + /* + * If OOM is memcg-wide, and the memcg has the oom_group flag set, + * all tasks belonging to the memcg should be killed. + * So, we mark the memcg as a victim. + */ + if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) { + oc->chosen_memcg = oc->memcg; + css_get(&oc->chosen_memcg->css); + return; + } + + /* + * The oom_score is calculated for leaf memory cgroups (including + * the root memcg). + * Non-leaf oom_group cgroups accumulating score of descendant + * leaf memory cgroups. + */ + rcu_read_lock(); + for_each_mem_cgroup_tree(iter, root) { + long score; + + /* + * We don't consider non-leaf non-oom_group memory cgroups + * as OOM victims. + */ + if (memcg_has_children(iter) && iter != root_mem_cgroup && + !mem_cgroup_oom_group(iter)) + continue; + + /* + * If group is not set or we've ran out of the group's sub-tree, + * we should set group and reset group_score. + */ + if (!group || group == root_mem_cgroup || + !mem_cgroup_is_descendant(iter, group)) { + group = iter; + group_score = 0; + } + + if (memcg_has_children(iter) && iter != root_mem_cgroup) + continue; + + score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages); + + /* + * Ignore empty and non-eligible memory cgroups. + */ + if (score == 0) + continue; + + /* + * If there are inflight OOM victims, we don't need + * to look further for new victims. + */ + if (score == -1) { + oc->chosen_memcg = INFLIGHT_VICTIM; + mem_cgroup_iter_break(root, iter); + break; + } + + group_score += score; + + if (group_score > oc->chosen_points) { + oc->chosen_points = group_score; + oc->chosen_memcg = group; + } + } + + if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) + css_get(&oc->chosen_memcg->css); + + rcu_read_unlock(); +} + +bool mem_cgroup_select_oom_victim(struct oom_control *oc) +{ + struct mem_cgroup *root; + + if (mem_cgroup_disabled()) + return false; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return false; + + if (oc->memcg) + root = oc->memcg; + else + root = root_mem_cgroup; + + select_victim_memcg(root, oc); + + return oc->chosen_memcg; +} + /* * Reclaims as many pages from the given memcg as possible. * @@ -3020,10 +3158,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, true); break; case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); @@ -4168,8 +4306,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat = alloc_percpu(struct lruvec_stat); - if (!pn->lruvec_stat) { + pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat_cpu) { kfree(pn); return 1; } @@ -4187,7 +4325,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - free_percpu(pn->lruvec_stat); + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -4197,7 +4335,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat); + free_percpu(memcg->stat_cpu); kfree(memcg); } @@ -4226,8 +4364,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) + memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat_cpu) goto fail; for_each_node(node) @@ -4584,8 +4722,8 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); + __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); + __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); } /* @@ -4597,16 +4735,14 @@ static int mem_cgroup_move_account(struct page *page, struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], - nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_DIRTY], - nr_pages); + __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); + __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); - __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); + __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); + __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } /* @@ -5268,6 +5404,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + bool oom_group = memcg->oom_group; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + seq_printf(m, "%d\n", oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int oom_group; + int err; + + if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM)) + return -ENOTSUPP; + + err = kstrtoint(strstrip(buf), 0, &oom_group); + if (err) + return err; + + memcg->oom_group = oom_group; + + return nbytes; +} + static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5388,6 +5557,12 @@ static struct cftype memory_files[] = { .write = memory_max_write, }, { + .name = "oom_group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, + { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct mem_cgroup, events_file), @@ -5642,12 +5817,12 @@ static void uncharge_batch(const struct uncharge_gather *ug) } local_irq_save(flags); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); - __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); - __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); - __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); + __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); + __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); + __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); + __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -5874,7 +6049,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; - this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5885,8 +6060,8 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) /** * mem_cgroup_uncharge_skmem - uncharge socket memory - * @memcg - memcg to uncharge - * @nr_pages - number of pages to uncharge + * @memcg: memcg to uncharge + * @nr_pages: number of pages to uncharge */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { @@ -5895,7 +6070,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) return; } - this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); refill_stock(memcg, nr_pages); } @@ -6019,7 +6194,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, nr_entries); + mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); page->mem_cgroup = NULL; @@ -6085,7 +6260,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); return 0; } @@ -6113,7 +6288,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) else page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, -nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4b80ccee4535..244bb9c22b4f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1473,7 +1473,7 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private, int **x) +static struct page *new_page(struct page *p, unsigned long private) { int nid = page_to_nid(p); diff --git a/mm/memory.c b/mm/memory.c index 9745b4548cb7..56dcf41171ac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* tlb_gather_mmu - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @fullmm argument is used when @mm is without - * users and we're going to destroy the full address space (exit/execve). +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @start and @end are set to 0 and -1 + * respectively when @mm is without users and we're going to destroy + * the full address space (exit/execve). */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -2806,8 +2813,37 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** + * unmap_mapping_pages() - Unmap pages from processes. + * @mapping: The address space containing pages to be unmapped. + * @start: Index of first page to be unmapped. + * @nr: Number of pages to be unmapped. 0 to unmap to end of file. + * @even_cows: Whether to unmap even private COWed pages. + * + * Unmap the pages in this address space from any userspace process which + * has them mmaped. Generally, you want to remove COWed pages as well when + * a file is being truncated, but not when invalidating pages from the page + * cache. + */ +void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, + pgoff_t nr, bool even_cows) +{ + struct zap_details details = { }; + + details.check_mapping = even_cows ? NULL : mapping; + details.first_index = start; + details.last_index = start + nr - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + +/** * unmap_mapping_range - unmap the portion of all mmaps in the specified - * address_space corresponding to the specified page range in the underlying + * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. @@ -2825,7 +2861,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -2837,16 +2872,7 @@ void unmap_mapping_range(struct address_space *mapping, hlen = ULONG_MAX - hba + 1; } - details.check_mapping = even_cows ? NULL : mapping; - details.first_index = hba; - details.last_index = hba + hlen - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; - - i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); - i_mmap_unlock_write(mapping); + unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); @@ -3030,7 +3056,6 @@ int do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -3043,6 +3068,7 @@ int do_swap_page(struct vm_fault *vmf) mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); swap_free(entry); if (mem_cgroup_swap_full(page) || diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 12df8a5fadcc..6a9bee33ffa7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -210,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -692,6 +689,10 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); + if (PageHWPoison(page)) { + ClearPageReserved(page); + continue; + } (*online_page_callback)(page); onlined_pages++; } @@ -1340,8 +1341,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } -static struct page *new_node_page(struct page *page, unsigned long private, - int **result) +static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; @@ -1384,7 +1384,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } else if (thp_migration_supported() && PageTransHuge(page)) + } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; @@ -1634,7 +1634,7 @@ repeat: goto failed_removal; cond_resched(); - lru_add_drain_all_cpuslocked(); + lru_add_drain_all(); drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ce44d3ff03d..a8b7d59002e8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, __split_huge_pmd(walk->vma, pmd, addr, false, NULL); goto out; } - if (!thp_migration_supported()) { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - goto out; - } if (!queue_pages_required(page, qp)) { ret = 1; goto unlock; @@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; -retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -511,22 +502,6 @@ retry: continue; if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page) && !thp_migration_supported()) { - get_page(page); - pte_unmap_unlock(pte, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - /* Failed to split -- skip. */ - if (ret) { - pte = pte_offset_map_lock(walk->mm, pmd, - addr, &ptl); - continue; - } - goto retry; - } - migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); @@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static struct page *new_node_page(struct page *page, unsigned long node, int **x) +/* page allocation callback for NUMA node migration */ +struct page *alloc_new_node_page(struct page *page, unsigned long node) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else if (thp_migration_supported() && PageTransHuge(page)) { + else if (PageTransHuge(page)) { struct page *thp; thp = alloc_pages_node(node, @@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, NULL, dest, + err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long uninitialized_var(address); @@ -1121,9 +1097,9 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) } if (PageHuge(page)) { - BUG_ON(!vma); - return alloc_huge_page_noerr(vma, address, 1); - } else if (thp_migration_supported() && PageTransHuge(page)) { + return alloc_huge_page_vma(page_hstate(compound_head(page)), + vma, address); + } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, @@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { return NULL; } @@ -1263,6 +1239,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; + unsigned long t; unsigned long nlongs; unsigned long endmask; @@ -1279,13 +1256,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, else endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ + /* + * When the user specified more nodes than supported just check + * if the non supported part is all zero. + * + * If maxnode have more longs than MAX_NUMNODES, check + * the bits in that area first. And then go through to + * check the rest bits which equal or bigger than MAX_NUMNODES. + * Otherwise, just check bits [MAX_NUMNODES, maxnode). + */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { @@ -1298,6 +1279,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, endmask = ~0UL; } + if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { + unsigned long valid_mask = endmask; + + valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + if (get_user(t, nmask + nlongs - 1)) + return -EFAULT; + if (t & valid_mask) + return -EINVAL; + } + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; nodes_addr(*nodes)[nlongs-1] &= endmask; @@ -1418,10 +1409,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_MEMORY])) { - err = -EINVAL; + task_nodes = cpuset_mems_allowed(current); + nodes_and(*new, *new, task_nodes); + if (nodes_empty(*new)) + goto out_put; + + nodes_and(*new, *new, node_states[N_MEMORY]); + if (nodes_empty(*new)) goto out_put; - } err = security_task_movememory(task); if (err) diff --git a/mm/mempool.c b/mm/mempool.c index 7d8c5a0010a2..5c9dce34719b 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -103,10 +103,10 @@ static inline void poison_element(mempool_t *pool, void *element) } #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ -static void kasan_poison_element(mempool_t *pool, void *element) +static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_poison_kfree(element); + kasan_poison_kfree(element, _RET_IP_); if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } @@ -119,7 +119,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } -static void add_element(mempool_t *pool, void *element) +static __always_inline void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); poison_element(pool, element); diff --git a/mm/migrate.c b/mm/migrate.c index 4d0be47a322a..5d0dc7b85f90 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1136,10 +1136,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - int *result = NULL; struct page *newpage; - newpage = get_new_page(page, private, &result); + if (!thp_migration_supported() && PageTransHuge(page)) + return -ENOMEM; + + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1160,14 +1162,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { - lock_page(page); - rc = split_huge_page(page); - unlock_page(page); - if (rc) - goto out; - } - rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1230,12 +1224,6 @@ put_new: put_page(newpage); } - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(newpage); - } return rc; } @@ -1263,7 +1251,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason) { int rc = -EAGAIN; - int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1280,7 +1267,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } - new_hpage = get_new_page(hpage, private, &result); + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; @@ -1323,9 +1310,8 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - hugetlb_cgroup_migrate(hpage, new_hpage); + move_hugetlb_state(hpage, new_hpage, reason); put_new_page = NULL; - set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1345,12 +1331,6 @@ out: else putback_active_hugepage(new_hpage); - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(new_hpage); - } return rc; } @@ -1395,6 +1375,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, retry = 0; list_for_each_entry_safe(page, page2, from, lru) { +retry: cond_resched(); if (PageHuge(page)) @@ -1408,6 +1389,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + /* + * THP migration might be unsupported or the + * allocation could've failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + if (PageTransHuge(page)) { + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) { + list_safe_reset_next(page, page2, lru); + goto retry; + } + } nr_failed++; goto out; case -EAGAIN: @@ -1444,141 +1445,101 @@ out: } #ifdef CONFIG_NUMA -/* - * Move a list of individual pages - */ -struct page_to_node { - unsigned long addr; - struct page *page; - int node; - int status; -}; -static struct page *new_page_node(struct page *p, unsigned long private, - int **result) +static int store_status(int __user *status, int start, int value, int nr) { - struct page_to_node *pm = (struct page_to_node *)private; - - while (pm->node != MAX_NUMNODES && pm->page != p) - pm++; + while (nr-- > 0) { + if (put_user(value, status + start)) + return -EFAULT; + start++; + } - if (pm->node == MAX_NUMNODES) - return NULL; + return 0; +} - *result = &pm->status; +static int do_move_pages_to_node(struct mm_struct *mm, + struct list_head *pagelist, int node) +{ + int err; - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - pm->node); - else if (thp_migration_supported() && PageTransHuge(p)) { - struct page *thp; + if (list_empty(pagelist)) + return 0; - thp = alloc_pages_node(pm->node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(pm->node, - GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(pagelist); + return err; } /* - * Move a set of pages as indicated in the pm array. The addr - * field must be set to the virtual address of the page to be moved - * and the node number must contain a valid target node. - * The pm array ends with node = MAX_NUMNODES. + * Resolves the given address to a struct page, isolates it from the LRU and + * puts it to the given pagelist. + * Returns -errno if the page cannot be found/isolated or 0 when it has been + * queued or the page doesn't need to be migrated because it is already on + * the target node */ -static int do_move_page_to_node_array(struct mm_struct *mm, - struct page_to_node *pm, - int migrate_all) +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, + int node, struct list_head *pagelist, bool migrate_all) { + struct vm_area_struct *vma; + struct page *page; + unsigned int follflags; int err; - struct page_to_node *pp; - LIST_HEAD(pagelist); down_read(&mm->mmap_sem); + err = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + goto out; - /* - * Build a list of pages to migrate - */ - for (pp = pm; pp->node != MAX_NUMNODES; pp++) { - struct vm_area_struct *vma; - struct page *page; - struct page *head; - unsigned int follflags; - - err = -EFAULT; - vma = find_vma(mm, pp->addr); - if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) - goto set_status; - - /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; - page = follow_page(vma, pp->addr, follflags); + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; - err = -ENOENT; - if (!page) - goto set_status; + err = -ENOENT; + if (!page) + goto out; - err = page_to_nid(page); + err = 0; + if (page_to_nid(page) == node) + goto out_putpage; - if (err == pp->node) - /* - * Node already in the right place - */ - goto put_and_set; + err = -EACCES; + if (page_mapcount(page) > 1 && !migrate_all) + goto out_putpage; - err = -EACCES; - if (page_mapcount(page) > 1 && - !migrate_all) - goto put_and_set; - - if (PageHuge(page)) { - if (PageHead(page)) { - isolate_huge_page(page, &pagelist); - err = 0; - pp->page = page; - } - goto put_and_set; + if (PageHuge(page)) { + if (PageHead(page)) { + isolate_huge_page(page, pagelist); + err = 0; } + } else { + struct page *head; - pp->page = compound_head(page); head = compound_head(page); err = isolate_lru_page(head); - if (!err) { - list_add_tail(&head->lru, &pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); - } -put_and_set: - /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. - */ - put_page(page); -set_status: - pp->status = err; - } - - err = 0; - if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, NULL, - (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_movable_pages(&pagelist); - } + goto out_putpage; + err = 0; + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); + } +out_putpage: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +out: up_read(&mm->mmap_sem); return err; } @@ -1593,79 +1554,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm; - unsigned long chunk_nr_pages; - unsigned long chunk_start; - int err; - - err = -ENOMEM; - pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); - if (!pm) - goto out; + int current_node = NUMA_NO_NODE; + LIST_HEAD(pagelist); + int start, i; + int err = 0, err1; migrate_prep(); - /* - * Store a chunk of page_to_node array in a page, - * but keep the last one as a marker - */ - chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - - for (chunk_start = 0; - chunk_start < nr_pages; - chunk_start += chunk_nr_pages) { - int j; - - if (chunk_start + chunk_nr_pages > nr_pages) - chunk_nr_pages = nr_pages - chunk_start; - - /* fill the chunk pm with addrs and nodes from user-space */ - for (j = 0; j < chunk_nr_pages; j++) { - const void __user *p; - int node; - - err = -EFAULT; - if (get_user(p, pages + j + chunk_start)) - goto out_pm; - pm[j].addr = (unsigned long) p; + for (i = start = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; + int node; - if (get_user(node, nodes + j + chunk_start)) - goto out_pm; - - err = -ENODEV; - if (node < 0 || node >= MAX_NUMNODES) - goto out_pm; - - if (!node_state(node, N_MEMORY)) - goto out_pm; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out_pm; + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_flush; + if (get_user(node, nodes + i)) + goto out_flush; + addr = (unsigned long)p; + + err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_flush; + if (!node_state(node, N_MEMORY)) + goto out_flush; - pm[j].node = node; + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_flush; + + if (current_node == NUMA_NO_NODE) { + current_node = node; + start = i; + } else if (node != current_node) { + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + start = i; + current_node = node; } - /* End marker for this chunk */ - pm[chunk_nr_pages].node = MAX_NUMNODES; + /* + * Errors in the page lookup or isolation are not fatal and we simply + * report them via status + */ + err = add_page_for_migration(mm, addr, current_node, + &pagelist, flags & MPOL_MF_MOVE_ALL); + if (!err) + continue; - /* Migrate this chunk */ - err = do_move_page_to_node_array(mm, pm, - flags & MPOL_MF_MOVE_ALL); - if (err < 0) - goto out_pm; + err = store_status(status, i, err, 1); + if (err) + goto out_flush; - /* Return status information */ - for (j = 0; j < chunk_nr_pages; j++) - if (put_user(pm[j].status, status + j + chunk_start)) { - err = -EFAULT; - goto out_pm; - } + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + if (i > start) { + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + } + current_node = NUMA_NO_NODE; } - err = 0; - -out_pm: - free_page((unsigned long)pm); +out_flush: + /* Make sure we do not overwrite the existing error */ + err1 = do_move_pages_to_node(mm, &pagelist, current_node); + if (!err1) + err1 = store_status(status, start, current_node, i - start); + if (!err) + err = err1; out: return err; } @@ -1836,8 +1797,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, } static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long data, - int **result) + unsigned long data) { int nid = (int) data; struct page *newpage; diff --git a/mm/mlock.c b/mm/mlock.c index f7f54fd2e13f..74e5a6547c3d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -64,6 +64,12 @@ void clear_page_mlock(struct page *page) mod_zone_page_state(page_zone(page), NR_MLOCK, -hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGCLEARED); + /* + * The previous TestClearPageMlocked() corresponds to the smp_mb() + * in __pagevec_lru_add_fn(). + * + * See __pagevec_lru_add_fn for more explanation. + */ if (!isolate_lru_page(page)) { putback_lru_page(page); } else { @@ -157,7 +163,7 @@ static void __munlock_isolation_failed(struct page *page) /** * munlock_vma_page - munlock a vma page - * @page - page to be unlocked, either a normal page or THP page head + * @page: page to be unlocked, either a normal page or THP page head * * returns the size of the page as a page mask (0 for normal page, * HPAGE_PMD_NR - 1 for THP head page) diff --git a/mm/mmap.c b/mm/mmap.c index 9efdc021ad22..4bb038e7984b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (flags & MAP_FIXED_NOREPLACE) { + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && vma->vm_start <= addr) + return -EEXIST; + } + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 96edb33fd09a..eff6b88a993f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); +/* + * Must be called while holding mm->mmap_sem for either read or write. + * The result is guaranteed to be valid until mm->mmap_sem is dropped. + */ +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + int id; + bool ret = false; + + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + + if (!mm_has_notifiers(mm)) + return ret; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + if (!mn->ops->invalidate_range && + !mn->ops->invalidate_range_start && + !mn->ops->invalidate_range_end) + continue; + + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { + ret = true; + break; + } + } + srcu_read_unlock(&srcu, id); + return ret; +} + static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/mm/mprotect.c b/mm/mprotect.c index 58b629bb70de..e3309fcf586b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page || PageKsm(page)) continue; + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + page_mapcount(page) != 1) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; diff --git a/mm/nommu.c b/mm/nommu.c index 17c00d93de2e..ebb6e618dade 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, - int even_cows) -{ -} -EXPORT_SYMBOL(unmap_mapping_range); - int filemap_fault(struct vm_fault *vmf) { BUG(); @@ -1843,7 +1836,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, } /** - * @access_remote_vm - access another process' address space + * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 29f855551efe..8219001708e0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -309,7 +309,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; } -static int oom_evaluate_task(struct task_struct *task, void *arg) +int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; unsigned long points; @@ -343,26 +343,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto next; /* Prefer thread group leaders for display purposes */ - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) goto next; select: - if (oc->chosen) - put_task_struct(oc->chosen); + if (oc->chosen_task) + put_task_struct(oc->chosen_task); get_task_struct(task); - oc->chosen = task; + oc->chosen_task = task; oc->chosen_points = points; next: return 0; abort: - if (oc->chosen) - put_task_struct(oc->chosen); - oc->chosen = (void *)-1UL; + if (oc->chosen_task) + put_task_struct(oc->chosen_task); + oc->chosen_task = INFLIGHT_VICTIM; return 1; } /* * Simple selection loop. We choose the process with the highest number of - * 'points'. In case scan was aborted, oc->chosen is set to -1. + * 'points'. In case scan was aborted, oc->chosen_task is set to -1. */ static void select_bad_process(struct oom_control *oc) { @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * If the mm has notifiers then we would need to invalidate them around - * unmap_page_range and that is risky because notifiers can sleep and - * what they do is basically undeterministic. So let's have a short + * If the mm has invalidate_{start,end}() notifiers that could block, * sleep to give the oom victim some more time. * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time and add - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + * notifiers cannot block for unbounded amount of time */ - if (mm_has_notifiers(mm)) { + if (mm_has_blockable_invalidate_notifiers(mm)) { up_read(&mm->mmap_sem); schedule_timeout_idle(HZ); goto unlock_oom; @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end); - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - NULL); - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end); + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); } } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", @@ -833,67 +834,22 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly + * __oom_kill_process() is used to kill all tasks belonging to + * the selected memory cgroup, so we should check that we're not + * trying to kill an unkillable task. */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); + if (is_global_init(victim) || (victim->flags & PF_KTHREAD) || + victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + put_task_struct(victim); return; } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); p = find_lock_task_mm(victim); if (!p) { @@ -968,6 +924,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } #undef K +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen_task; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + __oom_kill_process(victim); +} + +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + get_task_struct(task); + __oom_kill_process(task); + return 0; +} + +static bool oom_kill_memcg_victim(struct oom_control *oc) +{ + if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) + return oc->chosen_memcg; + + /* + * If memory.oom_group is set, kill all tasks belonging to the sub-tree + * of the chosen memory cgroup, otherwise kill the task with the biggest + * memory footprint. + */ + if (mem_cgroup_oom_group(oc->chosen_memcg)) { + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member, + NULL); + /* We have one or more terminating processes at this point. */ + oc->chosen_task = INFLIGHT_VICTIM; + } else { + oc->chosen_points = 0; + oc->chosen_task = NULL; + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); + + if (oc->chosen_task == NULL || + oc->chosen_task == INFLIGHT_VICTIM) + goto out; + + __oom_kill_process(oc->chosen_task); + } + +out: + mem_cgroup_put(oc->chosen_memcg); + return oc->chosen_task; +} + /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ @@ -1020,6 +1078,7 @@ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; + bool delay = false; /* if set, delay next allocation attempt */ if (oom_killer_disabled) return false; @@ -1064,27 +1123,37 @@ bool out_of_memory(struct oom_control *oc) current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oc->chosen = current; + oc->chosen_task = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } + if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) { + delay = true; + goto out; + } + select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { + if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) { oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - schedule_timeout_killable(1); + delay = true; } - return !!oc->chosen; + +out: + /* + * Give the killed process a good chance to exit before trying + * to allocate memory again. + */ + if (delay) + schedule_timeout_killable(1); + + return !!oc->chosen_task; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bb7f163baca..757087db7693 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -204,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -293,7 +294,7 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* - * Determine how many pages need to be initialized durig early boot + * Determine how many pages need to be initialized during early boot * (non-deferred initialization). * The value of first_deferred_pfn will be set later, once non-deferred pages * are initialized, but for now set it ULONG_MAX. @@ -344,7 +345,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - /* Always populate low zones for address-contrained allocations */ + /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; @@ -1457,92 +1458,87 @@ static inline void __init pgdat_init_report_one_done(void) } /* - * Helper for deferred_init_range, free the given range, reset the counters, and - * return number of pages freed. + * Returns true if page needs to be initialized or freed to buddy allocator. + * + * First we check if pfn is valid on architectures where it is possible to have + * holes within pageblock_nr_pages. On systems where it is not possible, this + * function is optimized out. + * + * Then, we check if a current large page is valid by only checking the validity + * of the head pfn. + * + * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not belong + * to this memory node. */ -static inline unsigned long __init __def_free(unsigned long *nr_free, - unsigned long *free_base_pfn, - struct page **page) +static inline bool __init +deferred_pfn_valid(int nid, unsigned long pfn, + struct mminit_pfnnid_cache *nid_init_state) { - unsigned long nr = *nr_free; + if (!pfn_valid_within(pfn)) + return false; + if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + return false; + if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) + return false; + return true; +} - deferred_free_range(*free_base_pfn, nr); - *free_base_pfn = 0; - *nr_free = 0; - *page = NULL; +/* + * Free pages to buddy allocator. Try to free aligned pages in + * pageblock_nr_pages sizes. + */ +static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long nr_free = 0; - return nr; + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 0; + } else if (!(pfn & nr_pgmask)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 1; + cond_resched(); + } else { + nr_free++; + } + } + /* Free the last block of pages to allocator */ + deferred_free_range(pfn - nr_free, nr_free); } -static unsigned long __init deferred_init_range(int nid, int zid, - unsigned long start_pfn, - unsigned long end_pfn) +/* + * Initialize struct pages. We minimize pfn page lookups and scheduler checks + * by performing it only once every pageblock_nr_pages. + * Return number of pages initialized. + */ +static unsigned long __init deferred_init_pages(int nid, int zid, + unsigned long pfn, + unsigned long end_pfn) { struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; - unsigned long free_base_pfn = 0; unsigned long nr_pages = 0; - unsigned long nr_free = 0; struct page *page = NULL; - unsigned long pfn; - /* - * First we check if pfn is valid on architectures where it is possible - * to have holes within pageblock_nr_pages. On systems where it is not - * possible, this function is optimized out. - * - * Then, we check if a current large page is valid by only checking the - * validity of the head pfn. - * - * meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not - * belong to this memory node. - * - * Finally, we minimize pfn page lookups and scheduler checks by - * performing it only once every pageblock_nr_pages. - * - * We do it in two loops: first we initialize struct page, than free to - * buddy allocator, becuse while we are freeing pages we can access - * pages that are ahead (computing buddy page in __free_one_page()). - */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + page = NULL; continue; - if ((pfn & nr_pgmask) || pfn_valid(pfn)) { - if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - if (page && (pfn & nr_pgmask)) - page++; - else - page = pfn_to_page(pfn); - __init_single_page(page, pfn, zid, nid); - cond_resched(); - } - } - } - - page = NULL; - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (page && (pfn & nr_pgmask)) { - page++; - nr_free++; - } else { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - free_base_pfn = pfn; - nr_free = 1; cond_resched(); + } else { + page++; } + __init_single_page(page, pfn, zid, nid); + nr_pages++; } - /* Free the last block of pages to allocator */ - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - - return nr_pages; + return (nr_pages); } /* Initialise remaining memory on a node */ @@ -1582,10 +1578,21 @@ static int __init deferred_init_memmap(void *data) } first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + /* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, than free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + */ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_range(nid, zid, spfn, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, epfn); + } + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ @@ -1628,16 +1635,38 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. + */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -2755,7 +2784,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3031,12 +3060,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. If these * are not met, then a high-order request also cannot go ahead @@ -3063,10 +3086,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3086,13 +3107,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3101,7 +3115,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3391,7 +3405,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_THISNODE) goto out; - /* Exhausted what can be done so it's blamo time */ + /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; @@ -3717,10 +3731,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -4187,9 +4197,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } @@ -4272,7 +4279,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) struct page *page; /* - * __get_free_pages() returns a 32-bit address, which cannot represent + * __get_free_pages() returns a virtual address, which cannot represent * a highmem page */ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); @@ -6070,6 +6077,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6097,9 +6105,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6148,12 +6160,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. + */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -6965,13 +6995,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += lower_zone->managed_pages; } } @@ -7762,7 +7794,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. diff --git a/mm/page_ext.c b/mm/page_ext.c index 2c16216c29b6..5295ef331165 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,7 +59,9 @@ */ static struct page_ext_operations *page_ext_ops[] = { +#ifdef CONFIG_DEBUG_PAGEALLOC &debug_guardpage_ops, +#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 165ed8117bd1..53d801235e22 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -293,8 +293,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ? -EBUSY : 0; } -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp) +struct page *alloc_migrate_target(struct page *page, unsigned long private) { return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/page_owner.c b/mm/page_owner.c index 270a8219ccd0..5881468f4253 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -528,32 +528,30 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; - /* Scan block by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { + unsigned long block_end_pfn; + if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - page = pfn_to_page(pfn); - for (; pfn < block_end_pfn; pfn++) { + struct page *page; + struct page_ext *page_ext; + if (!pfn_valid_within(pfn)) continue; @@ -635,9 +633,7 @@ static int __init pageowner_init(void) dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, NULL, &proc_page_owner_operations); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; + return PTR_ERR_OR_ZERO(dentry); } late_initcall(pageowner_init) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 23a3e415ac2c..8d2da5dec1e0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -265,6 +265,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these * callbacks, the associated entries/pages are just ignored. * The return values of these callbacks are commonly defined like below: + * * - 0 : succeeded to handle the current entry, and if you don't reach the * end address yet, continue to walk. * - >0 : succeeded to handle the current entry, and return to the caller diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1e4ee763c190..cf2af04b34b9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return old; } #endif diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 8973cd231ece..a447092d4635 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -25,7 +25,7 @@ /** * process_vm_rw_pages - read/write pages from task specified * @pages: array of pointers to pages we want to copy - * @start_offset: offset in page to start copying from/to + * @offset: offset in page to start copying from/to * @len: number of bytes to copy * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to @@ -147,6 +147,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process + * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. @@ -197,11 +198,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, } /* Get process information */ - rcu_read_lock(); - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); + task = find_get_task_by_vpid(pid); if (!task) { rc = -ESRCH; goto free_proc_pages; @@ -253,6 +250,7 @@ free_proc_pages: * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process + * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. diff --git a/mm/shmem.c b/mm/shmem.c index 7fbe67be86fa..1907688b75ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2717,15 +2717,28 @@ continue_resched: return error; } +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (file->f_op == &shmem_file_operations) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (file->f_op == &hugetlbfs_file_operations) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE) -int shmem_add_seals(struct file *file, unsigned int seals) +static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); + unsigned int *file_seals; int error; /* @@ -2758,8 +2771,6 @@ int shmem_add_seals(struct file *file, unsigned int seals) * other file types. */ - if (file->f_op != &shmem_file_operations) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) @@ -2767,12 +2778,18 @@ int shmem_add_seals(struct file *file, unsigned int seals) inode_lock(inode); - if (info->seals & F_SEAL_SEAL) { + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } - if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; @@ -2784,25 +2801,22 @@ int shmem_add_seals(struct file *file, unsigned int seals) } } - info->seals |= seals; + *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } -EXPORT_SYMBOL_GPL(shmem_add_seals); -int shmem_get_seals(struct file *file) +static int memfd_get_seals(struct file *file) { - if (file->f_op != &shmem_file_operations) - return -EINVAL; + unsigned int *seals = memfd_file_seals_ptr(file); - return SHMEM_I(file_inode(file))->seals; + return seals ? *seals : -EINVAL; } -EXPORT_SYMBOL_GPL(shmem_get_seals); -long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { long error; @@ -2812,10 +2826,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (arg > UINT_MAX) return -EINVAL; - error = shmem_add_seals(file, arg); + error = memfd_add_seals(file, arg); break; case F_GET_SEALS: - error = shmem_get_seals(file); + error = memfd_get_seals(file); break; default: error = -EINVAL; @@ -3657,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct shmem_inode_info *info; + unsigned int *file_seals; struct file *file; int fd, error; char *name; @@ -3667,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create, if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { - /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ - if (flags & MFD_ALLOW_SEALING) - return -EINVAL; /* Allow huge page size encoding in flags. */ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) @@ -3722,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create, file->f_flags |= O_RDWR | O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { - /* - * flags check at beginning of function ensures - * this is not a hugetlbfs (MFD_HUGETLB) file. - */ - info = SHMEM_I(file_inode(file)); - info->seals &= ~F_SEAL_SEAL; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); diff --git a/mm/slab.c b/mm/slab.c index 68a93a2a44c0..324446621b3e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1317,8 +1317,6 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - slab_state = UP; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&slab_mutex); list_for_each_entry(cachep, &slab_caches, list) @@ -1354,8 +1352,6 @@ static int __init cpucache_init(void) slab_online_cpu, slab_offline_cpu); WARN_ON(ret < 0); - /* Done! */ - slab_state = FULL; return 0; } __initcall(cpucache_init); @@ -3482,11 +3478,11 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) +static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp)) + if (kasan_slab_free(cachep, objp, _RET_IP_)) return; ___cache_free(cachep, objp, caller); diff --git a/mm/slab.h b/mm/slab.h index 1f013f7795c6..51813236e773 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -80,9 +80,6 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size); - #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index d00cd3f0f8ac..10f127b2de7c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -276,6 +276,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign; + + ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +/* * Find a mergeable slab cache */ int slab_unmergeable(struct kmem_cache *s) @@ -348,33 +377,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, return NULL; } -/* - * Figure out what the alignment of the objects will be given a set of - * flags, a user specified alignment and the size of the objects. - */ -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, slab_flags_t flags, size_t useroffset, diff --git a/mm/slub.c b/mm/slub.c index 862d835b3042..9456ce5e5f66 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) u8 *start; u8 *fault; u8 *end; + u8 *pad; int length; int remainder; @@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; + pad = end - remainder; metadata_access_enable(); - fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(pad, POISON_INUSE, remainder); metadata_access_disable(); if (!fault) return 1; @@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section(KERN_ERR, "Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", pad, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); + restore_bytes(s, "slab padding", POISON_INUSE, fault, end); return 0; } @@ -1354,13 +1356,13 @@ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) kasan_kmalloc_large(ptr, size, flags); } -static inline void kfree_hook(const void *x) +static __always_inline void kfree_hook(void *x) { kmemleak_free(x); - kasan_kfree_large(x); + kasan_kfree_large(x, _RET_IP_); } -static inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) { void *freeptr; @@ -1388,7 +1390,7 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) * kasan_slab_free() may put x into memory quarantine, delaying its * reuse. In this case the object's freelist pointer is changed. */ - kasan_slab_free(s, x); + kasan_slab_free(s, x, _RET_IP_); return freeptr; } @@ -3910,7 +3912,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kfree_hook(x); + kfree_hook(object); __free_pages(page, compound_order(page)); return; } diff --git a/mm/sparse.c b/mm/sparse.c index 2583174b1d62..7af5e7a92528 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, */ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) { - return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + unsigned long coded_mem_map = + (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT)); + BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); + return coded_mem_map; } /* diff --git a/mm/swap.c b/mm/swap.c index 38e1b6374a97..59ad491ada5e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -446,30 +446,6 @@ void lru_cache_add(struct page *page) } /** - * add_page_to_unevictable_list - add a page to the unevictable list - * @page: the page to be added to the unevictable list - * - * Add page directly to its zone's unevictable list. To avoid races with - * tasks that might be making the page evictable, through eg. munlock, - * munmap or exit, while it's not on the lru, we want to add the page - * while it's locked or otherwise "invisible" to other tasks. This is - * difficult to do when using the pagevec cache, so bypass that. - */ -void add_page_to_unevictable_list(struct page *page) -{ - struct pglist_data *pgdat = page_pgdat(page); - struct lruvec *lruvec; - - spin_lock_irq(&pgdat->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - ClearPageActive(page); - SetPageUnevictable(page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); - spin_unlock_irq(&pgdat->lru_lock); -} - -/** * lru_cache_add_active_or_unevictable * @page: the page to be added to LRU * @vma: vma in which page is mapped for determining reclaimability @@ -484,13 +460,9 @@ void lru_cache_add_active_or_unevictable(struct page *page, { VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) SetPageActive(page); - lru_cache_add(page); - return; - } - - if (!TestSetPageMlocked(page)) { + else if (!TestSetPageMlocked(page)) { /* * We use the irq-unsafe __mod_zone_page_stat because this * counter is not modified from interrupt context, and the pte @@ -500,7 +472,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } - add_page_to_unevictable_list(page); + lru_cache_add(page); } /* @@ -688,7 +660,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all_cpuslocked(void) +/* + * Doesn't need any cpu hotplug locking because we do rely on per-cpu + * kworkers being shut down before our page_alloc_cpu_dead callback is + * executed on the offlined cpu. + * Calling this function with cpu hotplug locks held can actually lead + * to obscure indirect dependencies via WQ context. + */ +void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -724,13 +703,6 @@ void lru_add_drain_all_cpuslocked(void) mutex_unlock(&lock); } -void lru_add_drain_all(void) -{ - get_online_cpus(); - lru_add_drain_all_cpuslocked(); - put_online_cpus(); -} - /** * release_pages - batched put_page() * @pages: array of pages to release @@ -886,15 +858,55 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { - int file = page_is_file_cache(page); - int active = PageActive(page); - enum lru_list lru = page_lru(page); + enum lru_list lru; + int was_unevictable = TestClearPageUnevictable(page); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); + /* + * Page becomes evictable in two ways: + * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. + * 2) Before acquiring LRU lock to put the page to correct LRU and then + * a) do PageLRU check with lock [check_move_unevictable_pages] + * b) do PageLRU check before lock [clear_page_mlock] + * + * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need + * following strict ordering: + * + * #0: __pagevec_lru_add_fn #1: clear_page_mlock + * + * SetPageLRU() TestClearPageMlocked() + * smp_mb() // explicit ordering // above provides strict + * // ordering + * PageMlocked() PageLRU() + * + * + * if '#1' does not observe setting of PG_lru by '#0' and fails + * isolation, the explicit barrier will make sure that page_evictable + * check will put the page in correct LRU. Without smp_mb(), SetPageLRU + * can be reordered after PageMlocked check and can make '#1' to fail + * the isolation of the page whose Mlocked bit is cleared (#0 is also + * looking at the same page) and the evictable page will be stranded + * in an unevictable LRU. + */ + smp_mb(); + + if (page_evictable(page)) { + lru = page_lru(page); + update_page_reclaim_stat(lruvec, page_is_file_cache(page), + PageActive(page)); + if (was_unevictable) + count_vm_event(UNEVICTABLE_PGRESCUED); + } else { + lru = LRU_UNEVICTABLE; + ClearPageActive(page); + SetPageUnevictable(page); + if (!was_unevictable) + count_vm_event(UNEVICTABLE_PGCULLED); + } + add_page_to_lru_list(page, lruvec, lru); - update_page_reclaim_stat(lruvec, file, active); trace_mm_lru_insertion(page, lru); } @@ -913,11 +925,11 @@ EXPORT_SYMBOL(__pagevec_lru_add); * @pvec: Where the resulting entries are placed * @mapping: The address_space to search * @start: The starting entry index - * @nr_entries: The maximum number of entries + * @nr_pages: The maximum number of pages * @indices: The cache indices corresponding to the entries in @pvec * * pagevec_lookup_entries() will search for and return a group of up - * to @nr_entries pages and shadow entries in the mapping. All + * to @nr_pages pages and shadow entries in the mapping. All * entries are placed in @pvec. pagevec_lookup_entries() takes a * reference against actual pages in @pvec. * diff --git a/mm/swapfile.c b/mm/swapfile.c index 42fe5653814a..006047b16814 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1800,8 +1800,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true, false); @@ -1810,6 +1808,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/truncate.c b/mm/truncate.c index e4b4cf0f4070..c34e2fd4f583 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -179,12 +179,8 @@ static void truncate_cleanup_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { - loff_t holelen; - - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); + pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1; + unmap_mapping_pages(mapping, page->index, nr, false); } if (page_has_private(page)) @@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, /* * Zap the rest of the file in one hit. */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - (loff_t)(1 + end - index) - << PAGE_SHIFT, - 0); + unmap_mapping_pages(mapping, index, + (1 + end - index), false); did_range_unmap = 1; } else { /* * Just zap this page */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, + 1, false); } } BUG_ON(page_mapped(page)); @@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * get remapped later. */ if (dax_mapping(mapping)) { - unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, - (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + unmap_mapping_pages(mapping, start, end - start + 1, false); } out: cleancache_invalidate_inode(mapping); diff --git a/mm/vmscan.c b/mm/vmscan.c index 47d5ced51f2d..8293c19c6582 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone) return nr; } -unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) -{ - unsigned long nr; - - nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); - - if (get_nr_swap_pages() > 0) - nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); - - return nr; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -310,9 +294,7 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) + struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; @@ -337,9 +319,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_scanned) / shrinker->seeks; - delta *= freeable; - do_div(delta, nr_eligible + 1); + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -373,8 +355,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_scanned, nr_eligible, - freeable, delta, total_scan); + freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one @@ -434,8 +415,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target - * @nr_scanned: pressure numerator - * @nr_eligible: pressure denominator + * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * @@ -447,20 +427,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * objects from the memory cgroup specified. Otherwise, only unaware * shrinkers are called. * - * @nr_scanned and @nr_eligible form a ratio that indicate how much of - * the available objects should be scanned. Page reclaim for example - * passes the number of pages scanned and the number of pages on the - * LRU lists that it considered on @nid, plus a bias in @nr_scanned - * when it encountered mapped pages. The ratio is further biased by - * the ->seeks setting of the shrink function, which indicates the - * cost to recreate an object relative to that of an LRU page. + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, - unsigned long nr_scanned, - unsigned long nr_eligible) + int priority) { struct shrinker *shrinker; unsigned long freed = 0; @@ -468,9 +442,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (nr_scanned == 0) - nr_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we @@ -490,6 +461,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, }; /* + * We are about to die and free our memory. + * Stop shrinking which might delay signal handling. + */ + if (unlikely(fatal_signal_pending(current))) + break; + + /* * If kernel memory accounting is disabled, we ignore * SHRINKER_MEMCG_AWARE flag and call all shrinkers * passing NULL for memcg. @@ -501,7 +479,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, priority); + /* + * Bail out if someone want to register a new shrinker to + * prevent the regsitration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } up_read(&shrinker_rwsem); @@ -519,8 +506,7 @@ void drop_slab_node(int nid) freed = 0; do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, - 1000, 1000); + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); } while (freed > 10); } @@ -790,64 +776,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - bool is_unevictable; - int was_unevictable = PageUnevictable(page); - - VM_BUG_ON_PAGE(PageLRU(page), page); - -redo: - ClearPageUnevictable(page); - - if (page_evictable(page)) { - /* - * For evictable pages, we can use the cache. - * In event of a race, worst case is we end up with an - * unevictable page on [in]active list. - * We know how to handle that. - */ - is_unevictable = false; - lru_cache_add(page); - } else { - /* - * Put unevictable pages directly on zone's unevictable - * list. - */ - is_unevictable = true; - add_page_to_unevictable_list(page); - /* - * When racing with an mlock or AS_UNEVICTABLE clearing - * (page is unlocked) make sure that if the other thread - * does not observe our setting of PG_lru and fails - * isolation/check_move_unevictable_pages, - * we see PG_mlocked/AS_UNEVICTABLE cleared below and move - * the page back to the evictable list. - * - * The other side is TestClearPageMlocked() or shmem_lock(). - */ - smp_mb(); - } - - /* - * page's status can change while we move it among lru. If an evictable - * page is on unevictable list, it never be freed. To avoid that, - * check after we added it to the list, again. - */ - if (is_unevictable && page_evictable(page)) { - if (!isolate_lru_page(page)) { - put_page(page); - goto redo; - } - /* This means someone else dropped this page from LRU - * So, it will be freed or putback to LRU again. There is - * nothing to do here. - */ - } - - if (was_unevictable && !is_unevictable) - count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && is_unevictable) - count_vm_event(UNEVICTABLE_PGCULLED); - + lru_cache_add(page); put_page(page); /* drop ref from isolate */ } @@ -1436,14 +1365,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; + bool migrate_dirty; /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate - * without blocking + * without blocking. However, we can be racing with + * truncation so it's necessary to lock the page + * to stabilise the mapping as truncation holds + * the page lock until after the page is removed + * from the page cache. */ + if (!trylock_page(page)) + return ret; + mapping = page_mapping(page); - if (mapping && !mapping->a_ops->migratepage) + migrate_dirty = mapping && mapping->a_ops->migratepage; + unlock_page(page); + if (!migrate_dirty) return ret; } } @@ -1606,6 +1545,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * found will be decremented. * * Restrictions: + * * (1) Must be called with an elevated refcount on the page. This is a * fundamentnal difference from isolate_lru_pages (which is called * without a stable reference). @@ -2615,14 +2555,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; if (memcg) shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->nr_scanned - scanned, - lru_pages); + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2646,14 +2584,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - /* - * Shrink the slab caches in the same proportion that - * the eligible LRU pages were scanned. - */ if (global_reclaim(sc)) shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->nr_scanned - nr_scanned, - node_lru_pages); + sc->priority); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; diff --git a/mm/z3fold.c b/mm/z3fold.c index 39e19125d6a0..d589d318727f 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -769,7 +769,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) /** * z3fold_reclaim_page() - evicts allocations from a pool page and frees it * @pool: pool from which a page will attempt to be evicted - * @retires: number of pages on the LRU list for which eviction will + * @retries: number of pages on the LRU list for which eviction will * be attempted before failing * * z3fold reclaim is different from normal system reclaim in that it is done @@ -779,7 +779,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) * z3fold and the user, however. * * To avoid these, this is how z3fold_reclaim_page() should be called: - + * * The user detects a page should be reclaimed and calls z3fold_reclaim_page(). * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and * call the user-defined eviction handler with the pool and handle as diff --git a/mm/zbud.c b/mm/zbud.c index b42322e50f63..28458f7d1e84 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -466,7 +466,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) /** * zbud_reclaim_page() - evicts allocations from a pool page and frees it * @pool: pool from which a page will attempt to be evicted - * @retires: number of pages on the LRU list for which eviction will + * @retries: number of pages on the LRU list for which eviction will * be attempted before failing * * zbud reclaim is different from normal system reclaim in that the reclaim is @@ -476,7 +476,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) * the user, however. * * To avoid these, this is how zbud_reclaim_page() should be called: - + * * The user detects a page should be reclaimed and calls zbud_reclaim_page(). * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call * the user-defined eviction handler with the pool and handle as arguments. diff --git a/mm/zpool.c b/mm/zpool.c index fd3ff719c32c..f8cb83e7699b 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,6 +21,7 @@ struct zpool { struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; + bool evictable; struct list_head list; }; @@ -100,7 +101,7 @@ static void zpool_put_driver(struct zpool_driver *driver) /** * zpool_has_pool() - Check if the pool driver is available - * @type The type of the zpool to check (e.g. zbud, zsmalloc) + * @type: The type of the zpool to check (e.g. zbud, zsmalloc) * * This checks if the @type pool driver is available. This will try to load * the requested module, if needed, but there is no guarantee the module will @@ -135,14 +136,14 @@ EXPORT_SYMBOL(zpool_has_pool); /** * zpool_create_pool() - Create a new zpool - * @type The type of the zpool to create (e.g. zbud, zsmalloc) - * @name The name of the zpool (e.g. zram0, zswap) - * @gfp The GFP flags to use when allocating the pool. - * @ops The optional ops callback. + * @type: The type of the zpool to create (e.g. zbud, zsmalloc) + * @name: The name of the zpool (e.g. zram0, zswap) + * @gfp: The GFP flags to use when allocating the pool. + * @ops: The optional ops callback. * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be shrinkable. + * ops param is NULL, then the created zpool will not be evictable. * * Implementations must guarantee this to be thread-safe. * @@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; + zpool->evictable = driver->shrink && ops && ops->evict; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -199,7 +201,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, /** * zpool_destroy_pool() - Destroy a zpool - * @pool The zpool to destroy. + * @zpool: The zpool to destroy. * * Implementations must guarantee this to be thread-safe, * however only when destroying different pools. The same @@ -222,7 +224,7 @@ void zpool_destroy_pool(struct zpool *zpool) /** * zpool_get_type() - Get the type of the zpool - * @pool The zpool to check + * @zpool: The zpool to check * * This returns the type of the pool. * @@ -237,10 +239,10 @@ const char *zpool_get_type(struct zpool *zpool) /** * zpool_malloc() - Allocate memory - * @pool The zpool to allocate from. - * @size The amount of memory to allocate. - * @gfp The GFP flags to use when allocating memory. - * @handle Pointer to the handle to set + * @zpool: The zpool to allocate from. + * @size: The amount of memory to allocate. + * @gfp: The GFP flags to use when allocating memory. + * @handle: Pointer to the handle to set * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the @@ -259,8 +261,8 @@ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, /** * zpool_free() - Free previously allocated memory - * @pool The zpool that allocated the memory. - * @handle The handle to the memory to free. + * @zpool: The zpool that allocated the memory. + * @handle: The handle to the memory to free. * * This frees previously allocated memory. This does not guarantee * that the pool will actually free memory, only that the memory @@ -278,9 +280,9 @@ void zpool_free(struct zpool *zpool, unsigned long handle) /** * zpool_shrink() - Shrink the pool size - * @pool The zpool to shrink. - * @pages The number of pages to shrink the pool. - * @reclaimed The number of pages successfully evicted. + * @zpool: The zpool to shrink. + * @pages: The number of pages to shrink the pool. + * @reclaimed: The number of pages successfully evicted. * * This attempts to shrink the actual memory size of the pool * by evicting currently used handle(s). If the pool was @@ -296,16 +298,17 @@ void zpool_free(struct zpool *zpool, unsigned long handle) int zpool_shrink(struct zpool *zpool, unsigned int pages, unsigned int *reclaimed) { - return zpool->driver->shrink(zpool->pool, pages, reclaimed); + return zpool->driver->shrink ? + zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; } /** * zpool_map_handle() - Map a previously allocated handle into memory - * @pool The zpool that the handle was allocated from - * @handle The handle to map - * @mm How the memory should be mapped + * @zpool: The zpool that the handle was allocated from + * @handle: The handle to map + * @mapmode: How the memory should be mapped * - * This maps a previously allocated handle into memory. The @mm + * This maps a previously allocated handle into memory. The @mapmode * param indicates to the implementation how the memory will be * used, i.e. read-only, write-only, read-write. If the * implementation does not support it, the memory will be treated @@ -329,8 +332,8 @@ void *zpool_map_handle(struct zpool *zpool, unsigned long handle, /** * zpool_unmap_handle() - Unmap a previously mapped handle - * @pool The zpool that the handle was allocated from - * @handle The handle to unmap + * @zpool: The zpool that the handle was allocated from + * @handle: The handle to unmap * * This unmaps a previously mapped handle. Any locks or other * actions that the implementation took in zpool_map_handle() @@ -344,7 +347,7 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) /** * zpool_get_total_size() - The total size of the pool - * @pool The zpool to check + * @zpool: The zpool to check * * This returns the total size in bytes of the pool. * @@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } +/** + * zpool_evictable() - Test if zpool is potentially evictable + * @pool The zpool to test + * + * Zpool is only potentially evictable when it's created with struct + * zpool_ops.evict and its driver implements struct zpool_driver.shrink. + * + * However, it doesn't necessarily mean driver will use zpool_ops.evict + * in its implementation of zpool_driver.shrink. It could do internal + * defragmentation instead. + * + * Returns: true if potentially evictable; false otherwise. + */ +bool zpool_evictable(struct zpool *zpool) +{ + return zpool->evictable; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 683c0651098c..c3013505c305 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -46,6 +46,7 @@ #include <linux/vmalloc.h> #include <linux/preempt.h> #include <linux/spinlock.h> +#include <linux/shrinker.h> #include <linux/types.h> #include <linux/debugfs.h> #include <linux/zsmalloc.h> @@ -257,11 +258,7 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; - /* - * To signify that register_shrinker() was successful - * and unregister_shrinker() will not Oops. - */ - bool shrinker_enabled; + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -407,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static int zs_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - return -EINVAL; -} - static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -450,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = { .destroy = zs_zpool_destroy, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, @@ -1057,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not. */ - link->next = -1 << OBJ_TAG_BITS; + link->next = -1UL << OBJ_TAG_BITS; } kunmap_atomic(vaddr); page = next_page; @@ -2324,10 +2314,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - if (pool->shrinker_enabled) { - unregister_shrinker(&pool->shrinker); - pool->shrinker_enabled = false; - } + unregister_shrinker(&pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) @@ -2426,11 +2413,13 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Not critical, we still can use the pool - * and user can trigger compaction manually. + * Not critical since shrinker is only used to trigger internal + * defragmentation of the pool which is pretty optional thing. If + * registration fails we still can use the pool normally and user can + * trigger compaction manually. Thus, ignore return code. */ - if (zs_register_shrinker(pool) == 0) - pool->shrinker_enabled = true; + zs_register_shrinker(pool); + return pool; err: diff --git a/mm/zswap.c b/mm/zswap.c index d39581a076c3..c004aa4fd3f4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -49,6 +49,8 @@ static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); +/* The number of same-value filled pages currently stored in zswap */ +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* Enable/disable handling same-value filled pages (enabled by default) */ +static bool zswap_same_filled_pages_enabled = true; +module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -145,9 +152,10 @@ struct zswap_pool { * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression. For a same value filled page length is 0. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data + * value - value of the same-value filled pages which have same content */ struct zswap_entry { struct rb_node rbnode; @@ -155,7 +163,10 @@ struct zswap_entry { int refcount; unsigned int length; struct zswap_pool *pool; - unsigned long handle; + union { + unsigned long handle; + unsigned long value; + }; }; struct zswap_header { @@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); @@ -953,6 +968,28 @@ static int zswap_shrink(void) return ret; } +static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos] != page[0]) + return 0; + } + *value = page[0]; + return 1; +} + +static void zswap_fill_page(void *ptr, unsigned long value) +{ + unsigned long *page; + + page = (unsigned long *)ptr; + memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); +} + /********************************* * frontswap hooks **********************************/ @@ -964,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_entry *entry, *dupentry; struct crypto_comp *tfm; int ret; - unsigned int dlen = PAGE_SIZE, len; - unsigned long handle; + unsigned int hlen, dlen = PAGE_SIZE; + unsigned long handle, value; char *buf; u8 *src, *dst; - struct zswap_header *zhdr; + struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; if (!zswap_enabled || !tree) { ret = -ENODEV; @@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto reject; } + if (zswap_same_filled_pages_enabled) { + src = kmap_atomic(page); + if (zswap_is_page_same_filled(src, &value)) { + kunmap_atomic(src); + entry->offset = offset; + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto insert_entry; + } + kunmap_atomic(src); + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { @@ -1013,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ - len = dlen + sizeof(struct zswap_header); - ret = zpool_malloc(entry->pool->zpool, len, + hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, &handle); if (ret == -ENOSPC) { @@ -1025,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); - zhdr->swpentry = swp_entry(type, offset); - buf = (u8 *)(zhdr + 1); - memcpy(buf, dst, dlen); + buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); + memcpy(buf, &zhdr, hlen); + memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); put_cpu_var(zswap_dstmem); @@ -1037,6 +1086,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry->handle = handle; entry->length = dlen; +insert_entry: /* map */ spin_lock(&tree->lock); do { @@ -1089,10 +1139,18 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } spin_unlock(&tree->lock); + if (!entry->length) { + dst = kmap_atomic(page); + zswap_fill_page(dst, entry->value); + kunmap_atomic(dst); + goto freeentry; + } + /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + if (zpool_evictable(entry->pool->zpool)) + src += sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); @@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); +freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); @@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_atomic_t("same_filled_pages", 0444, + zswap_debugfs_root, &zswap_same_filled_pages); return 0; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 107b122c8969..1822335518ea 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -616,18 +616,15 @@ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, return -EFAULT; memcpy(&to->base, &link_usettings.base, sizeof(to->base)); - bitmap_from_u32array(to->link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NBITS, - link_usettings.link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NU32); - bitmap_from_u32array(to->link_modes.advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS, - link_usettings.link_modes.advertising, - __ETHTOOL_LINK_MODE_MASK_NU32); - bitmap_from_u32array(to->link_modes.lp_advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS, - link_usettings.link_modes.lp_advertising, - __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_arr32(to->link_modes.supported, + link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_from_arr32(to->link_modes.advertising, + link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_from_arr32(to->link_modes.lp_advertising, + link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); return 0; } @@ -643,18 +640,15 @@ store_link_ksettings_for_user(void __user *to, struct ethtool_link_usettings link_usettings; memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); - bitmap_to_u32array(link_usettings.link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NU32, - from->link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_to_u32array(link_usettings.link_modes.advertising, - __ETHTOOL_LINK_MODE_MASK_NU32, - from->link_modes.advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_to_u32array(link_usettings.link_modes.lp_advertising, - __ETHTOOL_LINK_MODE_MASK_NU32, - from->link_modes.lp_advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_arr32(link_usettings.link_modes.supported, + from->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_to_arr32(link_usettings.link_modes.advertising, + from->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_to_arr32(link_usettings.link_modes.lp_advertising, + from->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) return -EFAULT; @@ -2358,10 +2352,8 @@ static int ethtool_get_per_queue_coalesce(struct net_device *dev, useraddr += sizeof(*per_queue_opt); - bitmap_from_u32array(queue_mask, - MAX_NUM_QUEUE, - per_queue_opt->queue_mask, - DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, + MAX_NUM_QUEUE); for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; @@ -2393,10 +2385,7 @@ static int ethtool_set_per_queue_coalesce(struct net_device *dev, useraddr += sizeof(*per_queue_opt); - bitmap_from_u32array(queue_mask, - MAX_NUM_QUEUE, - per_queue_opt->queue_mask, - DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); if (!backup) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 6a5d0e39bb87..9b9d2ff01b35 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -65,10 +65,10 @@ #include <net/ip6_checksum.h> /* Ensure that we have struct in6_addr aligned on 32bit word. */ -static void *__mld2_query_bugs[] __attribute__((__unused__)) = { - BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), - BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4), - BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4) +static int __mld2_query_bugs[] __attribute__((__unused__)) = { + BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4), + BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4), + BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 1ce7115aa499..3fb382c69ff6 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -10,10 +10,7 @@ KASAN_SHADOW_OFFSET ?= $(CONFIG_KASAN_SHADOW_OFFSET) CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address -CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \ - -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET) \ - --param asan-stack=1 --param asan-globals=1 \ - --param asan-instrumentation-with-call-threshold=$(call_threshold)) +cc-param = $(call cc-option, -mllvm -$(1), $(call cc-option, --param $(1))) ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) ifneq ($(CONFIG_COMPILE_TEST),y) @@ -21,14 +18,28 @@ ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),) -fsanitize=kernel-address is not supported by compiler) endif else - ifeq ($(CFLAGS_KASAN),) - ifneq ($(CONFIG_COMPILE_TEST),y) - $(warning CONFIG_KASAN: compiler does not support all options.\ - Trying minimal configuration) - endif - CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) - endif + # -fasan-shadow-offset fails without -fsanitize + CFLAGS_KASAN_SHADOW := $(call cc-option, -fsanitize=kernel-address \ + -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET), \ + $(call cc-option, -fsanitize=kernel-address \ + -mllvm -asan-mapping-offset=$(KASAN_SHADOW_OFFSET))) + + ifeq ($(strip $(CFLAGS_KASAN_SHADOW)),) + CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) + else + # Now add all the compiler specific options that are valid standalone + CFLAGS_KASAN := $(CFLAGS_KASAN_SHADOW) \ + $(call cc-param,asan-globals=1) \ + $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ + $(call cc-param,asan-stack=1) \ + $(call cc-param,asan-use-after-scope=1) \ + $(call cc-param,asan-instrument-allocas=1) + endif + endif CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) + +CFLAGS_KASAN_NOSANITIZE := -fno-builtin + endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 5fdc1a19b02c..5589bae34af6 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -121,7 +121,7 @@ endif ifeq ($(CONFIG_KASAN),y) _c_flags += $(if $(patsubst n%,, \ $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \ - $(CFLAGS_KASAN)) + $(CFLAGS_KASAN), $(CFLAGS_KASAN_NOSANITIZE)) endif ifeq ($(CONFIG_UBSAN),y) diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 8fd4d44fbcd1..b593b36ccff8 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -7,7 +7,6 @@ ifdef CONFIG_UBSAN CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow) CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size) - CFLAGS_UBSAN += $(call cc-option, -fsanitize=returns-nonnull-attribute) CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool) CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e954df2b2077..78e7a310af46 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -566,6 +566,7 @@ foreach my $entry (@mode_permission_funcs) { $mode_perms_search .= '|' if ($mode_perms_search ne ""); $mode_perms_search .= $entry->[0]; } +$mode_perms_search = "(?:${mode_perms_search})"; our $mode_perms_world_writable = qr{ S_IWUGO | @@ -600,6 +601,37 @@ foreach my $entry (keys %mode_permission_string_types) { $mode_perms_string_search .= '|' if ($mode_perms_string_search ne ""); $mode_perms_string_search .= $entry; } +our $single_mode_perms_string_search = "(?:${mode_perms_string_search})"; +our $multi_mode_perms_string_search = qr{ + ${single_mode_perms_string_search} + (?:\s*\|\s*${single_mode_perms_string_search})* +}x; + +sub perms_to_octal { + my ($string) = @_; + + return trim($string) if ($string =~ /^\s*0[0-7]{3,3}\s*$/); + + my $val = ""; + my $oval = ""; + my $to = 0; + my $curpos = 0; + my $lastpos = 0; + while ($string =~ /\b(($single_mode_perms_string_search)\b(?:\s*\|\s*)?\s*)/g) { + $curpos = pos($string); + my $match = $2; + my $omatch = $1; + last if ($lastpos > 0 && ($curpos - length($omatch) != $lastpos)); + $lastpos = $curpos; + $to |= $mode_permission_string_types{$match}; + $val .= '\s*\|\s*' if ($val ne ""); + $val .= $match; + $oval .= $omatch; + } + $oval =~ s/^\s*\|\s*//; + $oval =~ s/\s*\|\s*$//; + return sprintf("%04o", $to); +} our $allowed_asm_includes = qr{(?x: irq| @@ -2875,6 +2907,7 @@ sub process { # logging functions like pr_info that end in a string # lines with a single string # #defines that are a single string +# lines with an RFC3986 like URL # # There are 3 different line length message types: # LONG_LINE_COMMENT a comment starts before but extends beyond $max_line_length @@ -2906,6 +2939,10 @@ sub process { $line =~ /^\+\s*(?:\w+)?\s*DEFINE_PER_CPU/) { $msg_type = ""; + # URL ($rawline is used in case the URL is in a comment) + } elsif ($rawline =~ /^\+.*\b[a-z][\w\.\+\-]*:\/\/\S+/i) { + $msg_type = ""; + # Otherwise set the alternate message types # a comment starts before $max_line_length @@ -2983,7 +3020,7 @@ sub process { # check indentation starts on a tab stop if ($^V && $^V ge 5.10.0 && - $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$))/) { + $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) { my $indent = length($1); if ($indent % 8) { if (WARN("TABSTOP", @@ -4489,7 +4526,9 @@ sub process { } # check for unnecessary parentheses around comparisons in if uses - if ($^V && $^V ge 5.10.0 && defined($stat) && +# when !drivers/staging or command-line uses --strict + if (($realfile !~ m@^(?:drivers/staging/)@ || $check_orig) && + $^V && $^V ge 5.10.0 && defined($stat) && $stat =~ /(^.\s*if\s*($balanced_parens))/) { my $if_stat = $1; my $test = substr($2, 1, -1); @@ -5307,7 +5346,7 @@ sub process { } # check for line continuations in quoted strings with odd counts of " - if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) { + if ($rawline =~ /\\$/ && $sline =~ tr/"/"/ % 2) { WARN("LINE_CONTINUATIONS", "Avoid line continuations in quoted strings\n" . $herecurr); } @@ -6269,8 +6308,69 @@ sub process { "Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr); } +# check for DEVICE_ATTR uses that could be DEVICE_ATTR_<FOO> +# and whether or not function naming is typical and if +# DEVICE_ATTR permissions uses are unusual too + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /\bDEVICE_ATTR\s*\(\s*(\w+)\s*,\s*\(?\s*(\s*(?:${multi_mode_perms_string_search}|0[0-7]{3,3})\s*)\s*\)?\s*,\s*(\w+)\s*,\s*(\w+)\s*\)/) { + my $var = $1; + my $perms = $2; + my $show = $3; + my $store = $4; + my $octal_perms = perms_to_octal($perms); + if ($show =~ /^${var}_show$/ && + $store =~ /^${var}_store$/ && + $octal_perms eq "0644") { + if (WARN("DEVICE_ATTR_RW", + "Use DEVICE_ATTR_RW\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\bDEVICE_ATTR\s*\(\s*$var\s*,\s*\Q$perms\E\s*,\s*$show\s*,\s*$store\s*\)/DEVICE_ATTR_RW(${var})/; + } + } elsif ($show =~ /^${var}_show$/ && + $store =~ /^NULL$/ && + $octal_perms eq "0444") { + if (WARN("DEVICE_ATTR_RO", + "Use DEVICE_ATTR_RO\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\bDEVICE_ATTR\s*\(\s*$var\s*,\s*\Q$perms\E\s*,\s*$show\s*,\s*NULL\s*\)/DEVICE_ATTR_RO(${var})/; + } + } elsif ($show =~ /^NULL$/ && + $store =~ /^${var}_store$/ && + $octal_perms eq "0200") { + if (WARN("DEVICE_ATTR_WO", + "Use DEVICE_ATTR_WO\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\bDEVICE_ATTR\s*\(\s*$var\s*,\s*\Q$perms\E\s*,\s*NULL\s*,\s*$store\s*\)/DEVICE_ATTR_WO(${var})/; + } + } elsif ($octal_perms eq "0644" || + $octal_perms eq "0444" || + $octal_perms eq "0200") { + my $newshow = "$show"; + $newshow = "${var}_show" if ($show ne "NULL" && $show ne "${var}_show"); + my $newstore = $store; + $newstore = "${var}_store" if ($store ne "NULL" && $store ne "${var}_store"); + my $rename = ""; + if ($show ne $newshow) { + $rename .= " '$show' to '$newshow'"; + } + if ($store ne $newstore) { + $rename .= " '$store' to '$newstore'"; + } + WARN("DEVICE_ATTR_FUNCTIONS", + "Consider renaming function(s)$rename\n" . $herecurr); + } else { + WARN("DEVICE_ATTR_PERMS", + "DEVICE_ATTR unusual permissions '$perms' used\n" . $herecurr); + } + } + # Mode permission misuses where it seems decimal should be octal # This uses a shortcut match to avoid unnecessary uses of a slow foreach loop +# o Ignore module_param*(...) uses with a decimal 0 permission as that has a +# specific definition of not visible in sysfs. +# o Ignore proc_create*(...) uses with a decimal 0 permission as that means +# use the default permissions if ($^V && $^V ge 5.10.0 && defined $stat && $line =~ /$mode_perms_search/) { @@ -6294,8 +6394,9 @@ sub process { if ($stat =~ /$test/) { my $val = $1; $val = $6 if ($skip_args ne ""); - if (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || - ($val =~ /^$Octal$/ && length($val) ne 4)) { + if (!($func =~ /^(?:module_param|proc_create)/ && $val eq "0") && + (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || + ($val =~ /^$Octal$/ && length($val) ne 4))) { ERROR("NON_OCTAL_PERMISSIONS", "Use 4 digit octal (0777) not decimal permissions\n" . "$here\n" . $stat_real); } @@ -6308,30 +6409,13 @@ sub process { } # check for uses of S_<PERMS> that could be octal for readability - if ($line =~ /\b$mode_perms_string_search\b/) { - my $val = ""; - my $oval = ""; - my $to = 0; - my $curpos = 0; - my $lastpos = 0; - while ($line =~ /\b(($mode_perms_string_search)\b(?:\s*\|\s*)?\s*)/g) { - $curpos = pos($line); - my $match = $2; - my $omatch = $1; - last if ($lastpos > 0 && ($curpos - length($omatch) != $lastpos)); - $lastpos = $curpos; - $to |= $mode_permission_string_types{$match}; - $val .= '\s*\|\s*' if ($val ne ""); - $val .= $match; - $oval .= $omatch; - } - $oval =~ s/^\s*\|\s*//; - $oval =~ s/\s*\|\s*$//; - my $octal = sprintf("%04o", $to); + if ($line =~ /\b($multi_mode_perms_string_search)\b/) { + my $oval = $1; + my $octal = perms_to_octal($oval); if (WARN("SYMBOLIC_PERMS", "Symbolic permissions '$oval' are not preferred. Consider using octal permissions '$octal'.\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s/$val/$octal/; + $fixed[$fixlinenr] =~ s/\Q$oval\E/$octal/; } } diff --git a/scripts/decodecode b/scripts/decodecode index 5ea071099330..9cef558528aa 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -21,12 +21,24 @@ trap cleanup EXIT T=`mktemp` || die "cannot create temp file" code= +cont= while read i ; do case "$i" in *Code:*) code=$i + cont=yes + ;; +*) + [ -n "$cont" ] && { + xdump="$(echo $i | grep '^[[:xdigit:]<>[:space:]]\+$')" + if [ -n "$xdump" ]; then + code="$code $xdump" + else + cont= + fi + } ;; esac diff --git a/scripts/tags.sh b/scripts/tags.sh index d23dcbf17457..78e546ff689c 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -77,7 +77,7 @@ find_include_sources() find_other_sources() { find ${tree}* $ignore \ - \( -name include -o -name arch -o -name '.tmp_*' \) -prune -o \ + \( -path ${tree}include -o -path ${tree}arch -o -name '.tmp_*' \) -prune -o \ -name "$1" -not -type l -print; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 9a65eeaf7dfa..6631fda41c03 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -688,6 +688,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm) aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_ctx->label); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 8644d864e3c1..35ef1e9045e8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2564,6 +2564,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) /* Always clear parent death signal on SID transitions. */ current->pdeath_signal = 0; + current->signal->pdeath_signal_proc = 0; /* Check whether the new SID can inherit resource limits from the old * SID. If not, reset all soft limits to the lower of the current diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 8298e094f4f7..ffda91a4a1aa 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -250,15 +250,10 @@ int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, } else { struct task_struct *tracer; - rcu_read_lock(); - tracer = find_task_by_vpid(arg2); - if (tracer) - get_task_struct(tracer); - else + tracer = find_get_task_by_vpid(arg2); + if (!tracer) { rc = -EINVAL; - rcu_read_unlock(); - - if (tracer) { + } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } diff --git a/tools/cgroup/Makefile b/tools/cgroup/Makefile index 860fa151640a..ffca068e4a76 100644 --- a/tools/cgroup/Makefile +++ b/tools/cgroup/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for cgroup tools -CC = $(CROSS_COMPILE)gcc CFLAGS = -Wall -Wextra all: cgroup_event_listener diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile index 805a2c0cf4cd..240eda014b37 100644 --- a/tools/gpio/Makefile +++ b/tools/gpio/Makefile @@ -12,8 +12,6 @@ endif # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r -CC = $(CROSS_COMPILE)gcc -LD = $(CROSS_COMPILE)ld CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include ALL_TARGETS := lsgpio gpio-hammer gpio-event-mon diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 1139d71fa0cf..5db5e62cebda 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Hyper-V tools -CC = $(CROSS_COMPILE)gcc WARNINGS = -Wall -Wextra CFLAGS = $(WARNINGS) -g $(shell getconf LFS_CFLAGS) diff --git a/tools/iio/Makefile b/tools/iio/Makefile index a08e7a47d6a3..332ed2f6c2c2 100644 --- a/tools/iio/Makefile +++ b/tools/iio/Makefile @@ -12,8 +12,6 @@ endif # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r -CC = $(CROSS_COMPILE)gcc -LD = $(CROSS_COMPILE)ld CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include ALL_TARGETS := iio_event_monitor lsiio iio_generic_buffer diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h index 9311fadaaab2..16ed1982cb34 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/asm-generic/bitops/find.h @@ -16,6 +16,22 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset); #endif +#ifndef find_next_and_bit +/** + * find_next_and_bit - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +extern unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset); +#endif + #ifndef find_next_zero_bit /** diff --git a/tools/laptop/freefall/Makefile b/tools/laptop/freefall/Makefile index 5f758c489a20..b572d94255f6 100644 --- a/tools/laptop/freefall/Makefile +++ b/tools/laptop/freefall/Makefile @@ -2,7 +2,6 @@ PREFIX ?= /usr SBINDIR ?= sbin INSTALL ?= install -CC = $(CROSS_COMPILE)gcc TARGET = freefall diff --git a/tools/leds/Makefile b/tools/leds/Makefile index c379af003807..7b6bed13daaa 100644 --- a/tools/leds/Makefile +++ b/tools/leds/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for LEDs tools -CC = $(CROSS_COMPILE)gcc CFLAGS = -Wall -Wextra -g -I../../include/uapi all: uledmon led_hw_brightness_mon diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 42c15f906aac..9474f5856d4e 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -22,22 +22,29 @@ #include <linux/bitmap.h> #include <linux/kernel.h> -#if !defined(find_next_bit) +#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ + !defined(find_next_and_bit) /* - * This is a common helper function for find_next_bit and - * find_next_zero_bit. The difference is the "invert" argument, which - * is XORed with each fetched word before searching it for one bits. + * This is a common helper function for find_next_bit, find_next_zero_bit, and + * find_next_and_bit. The differences are: + * - The "invert" argument, which is XORed with each fetched word before + * searching it for one bits. + * - The optional "addr2", which is anded with "addr1" if present. */ -static unsigned long _find_next_bit(const unsigned long *addr, - unsigned long nbits, unsigned long start, unsigned long invert) +static inline unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert) { unsigned long tmp; if (unlikely(start >= nbits)) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; /* Handle 1st word. */ tmp &= BITMAP_FIRST_WORD_MASK(start); @@ -48,7 +55,10 @@ static unsigned long _find_next_bit(const unsigned long *addr, if (start >= nbits) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; } return min(start + __ffs(tmp), nbits); @@ -62,7 +72,7 @@ static unsigned long _find_next_bit(const unsigned long *addr, unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - return _find_next_bit(addr, size, offset, 0UL); + return _find_next_bit(addr, NULL, size, offset, 0UL); } #endif @@ -104,6 +114,16 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - return _find_next_bit(addr, size, offset, ~0UL); + return _find_next_bit(addr, NULL, size, offset, ~0UL); } #endif + +#ifndef find_next_and_bit +unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr1, addr2, size, offset, 0UL); +} +#endif + diff --git a/tools/lib/subcmd/pager.c b/tools/lib/subcmd/pager.c index 5ba754d17952..9997a8805a82 100644 --- a/tools/lib/subcmd/pager.c +++ b/tools/lib/subcmd/pager.c @@ -30,10 +30,13 @@ static void pager_preexec(void) * have real input */ fd_set in; + fd_set exception; FD_ZERO(&in); + FD_ZERO(&exception); FD_SET(0, &in); - select(1, &in, NULL, &in, NULL); + FD_SET(0, &exception); + select(1, &in, NULL, &exception, NULL); setenv("LESS", "FRSX", 0); } diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 9b0351d3ce34..012328038594 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -146,12 +146,6 @@ define allow-override $(eval $(1) = $(2))) endef -# Allow setting CC and AR and LD, or setting CROSS_COMPILE as a prefix. -$(call allow-override,CC,$(CROSS_COMPILE)gcc) -$(call allow-override,AR,$(CROSS_COMPILE)ar) -$(call allow-override,LD,$(CROSS_COMPILE)ld) -$(call allow-override,CXX,$(CROSS_COMPILE)g++) - LD += $(EXTRA_LDFLAGS) HOSTCC ?= gcc diff --git a/tools/power/acpi/Makefile.config b/tools/power/acpi/Makefile.config index a1883bbb0144..2cccbba64418 100644 --- a/tools/power/acpi/Makefile.config +++ b/tools/power/acpi/Makefile.config @@ -56,9 +56,6 @@ INSTALL_SCRIPT = ${INSTALL_PROGRAM} # to compile vs uClibc, that can be done here as well. CROSS = #/usr/i386-linux-uclibc/usr/bin/i386-uclibc- CROSS_COMPILE ?= $(CROSS) -CC = $(CROSS_COMPILE)gcc -LD = $(CROSS_COMPILE)gcc -STRIP = $(CROSS_COMPILE)strip HOSTCC = gcc # check if compiler option is supported diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index fcb3ed0be5f8..dd614463d4d6 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -42,6 +42,24 @@ EXTRA_WARNINGS += -Wformat CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) +# Makefiles suck: This macro sets a default value of $(2) for the +# variable named by $(1), unless the variable has been set by +# environment or command line. This is necessary for CC and AR +# because make sets default values, so the simpler ?= approach +# won't work as expected. +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +# Allow setting various cross-compile vars or setting CROSS_COMPILE as a prefix. +$(call allow-override,CC,$(CROSS_COMPILE)gcc) +$(call allow-override,AR,$(CROSS_COMPILE)ar) +$(call allow-override,LD,$(CROSS_COMPILE)ld) +$(call allow-override,CXX,$(CROSS_COMPILE)g++) +$(call allow-override,STRIP,$(CROSS_COMPILE)strip) + ifeq ($(CC_NO_CLANG), 1) EXTRA_WARNINGS += -Wstrict-aliasing=3 endif diff --git a/tools/spi/Makefile b/tools/spi/Makefile index 90615e10c79a..815d15589177 100644 --- a/tools/spi/Makefile +++ b/tools/spi/Makefile @@ -11,8 +11,6 @@ endif # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r -CC = $(CROSS_COMPILE)gcc -LD = $(CROSS_COMPILE)ld CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include ALL_TARGETS := spidev_test spidev_fdx diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index 3926a0409dda..a5276a91dfbf 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile @@ -12,3 +12,8 @@ fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) include ../lib.mk $(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs) + +$(OUTPUT)/memfd_test: memfd_test.c common.o +$(OUTPUT)/fuse_test: fuse_test.c common.o + +EXTRA_CLEAN = common.o diff --git a/tools/testing/selftests/memfd/common.c b/tools/testing/selftests/memfd/common.c new file mode 100644 index 000000000000..8eb3d75f6e60 --- /dev/null +++ b/tools/testing/selftests/memfd/common.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <stdio.h> +#include <stdlib.h> +#include <linux/fcntl.h> +#include <linux/memfd.h> +#include <unistd.h> +#include <sys/syscall.h> + +#include "common.h" + +int hugetlbfs_test = 0; + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +int sys_memfd_create(const char *name, unsigned int flags) +{ + if (hugetlbfs_test) + flags |= MFD_HUGETLB; + + return syscall(__NR_memfd_create, name, flags); +} diff --git a/tools/testing/selftests/memfd/common.h b/tools/testing/selftests/memfd/common.h new file mode 100644 index 000000000000..522d2c630bd8 --- /dev/null +++ b/tools/testing/selftests/memfd/common.h @@ -0,0 +1,9 @@ +#ifndef COMMON_H_ +#define COMMON_H_ + +extern int hugetlbfs_test; + +unsigned long default_huge_page_size(void); +int sys_memfd_create(const char *name, unsigned int flags); + +#endif diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 1ccb7a3eb14b..b018e835737d 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -33,14 +33,12 @@ #include <sys/wait.h> #include <unistd.h> +#include "common.h" + #define MFD_DEF_SIZE 8192 #define STACK_SIZE 65536 -static int sys_memfd_create(const char *name, - unsigned int flags) -{ - return syscall(__NR_memfd_create, name, flags); -} +static size_t mfd_def_size = MFD_DEF_SIZE; static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { @@ -127,7 +125,7 @@ static void *mfd_assert_mmap_shared(int fd) void *p; p = mmap(NULL, - MFD_DEF_SIZE, + mfd_def_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, @@ -145,7 +143,7 @@ static void *mfd_assert_mmap_private(int fd) void *p; p = mmap(NULL, - MFD_DEF_SIZE, + mfd_def_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, @@ -178,7 +176,7 @@ static int sealing_thread_fn(void *arg) usleep(200000); /* unmount mapping before sealing to avoid i_mmap_writable failures */ - munmap(global_p, MFD_DEF_SIZE); + munmap(global_p, mfd_def_size); /* Try sealing the global file; expect EBUSY or success. Current * kernels will never succeed, but in the future, kernels might @@ -228,7 +226,7 @@ static void join_sealing_thread(pid_t pid) int main(int argc, char **argv) { - static const char zero[MFD_DEF_SIZE]; + char *zero; int fd, mfd, r; void *p; int was_sealed; @@ -239,6 +237,25 @@ int main(int argc, char **argv) abort(); } + if (argc >= 3) { + if (!strcmp(argv[2], "hugetlbfs")) { + unsigned long hpage_size = default_huge_page_size(); + + if (!hpage_size) { + printf("Unable to determine huge page size\n"); + abort(); + } + + hugetlbfs_test = 1; + mfd_def_size = hpage_size * 2; + } else { + printf("Unknown option: %s\n", argv[2]); + abort(); + } + } + + zero = calloc(sizeof(*zero), mfd_def_size); + /* open FUSE memfd file for GUP testing */ printf("opening: %s\n", argv[1]); fd = open(argv[1], O_RDONLY | O_CLOEXEC); @@ -249,7 +266,7 @@ int main(int argc, char **argv) /* create new memfd-object */ mfd = mfd_assert_new("kern_memfd_fuse", - MFD_DEF_SIZE, + mfd_def_size, MFD_CLOEXEC | MFD_ALLOW_SEALING); /* mmap memfd-object for writing */ @@ -268,7 +285,7 @@ int main(int argc, char **argv) * This guarantees that the receive-buffer is pinned for 1s until the * data is written into it. The racing ADD_SEALS should thus fail as * the pages are still pinned. */ - r = read(fd, p, MFD_DEF_SIZE); + r = read(fd, p, mfd_def_size); if (r < 0) { printf("read() failed: %m\n"); abort(); @@ -295,10 +312,10 @@ int main(int argc, char **argv) * enough to avoid any in-flight writes. */ p = mfd_assert_mmap_private(mfd); - if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { + if (was_sealed && memcmp(p, zero, mfd_def_size)) { printf("memfd sealed during read() but data not discarded\n"); abort(); - } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { + } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { printf("memfd sealed after read() but data discarded\n"); abort(); } @@ -307,6 +324,7 @@ int main(int argc, char **argv) close(fd); printf("fuse: DONE\n"); + free(zero); return 0; } diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 132a54f74e88..10baa1652fc2 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -19,7 +19,10 @@ #include <sys/wait.h> #include <unistd.h> +#include "common.h" + #define MEMFD_STR "memfd:" +#define MEMFD_HUGE_STR "memfd-hugetlb:" #define SHARED_FT_STR "(shared file-table)" #define MFD_DEF_SIZE 8192 @@ -28,41 +31,8 @@ /* * Default is not to test hugetlbfs */ -static int hugetlbfs_test; static size_t mfd_def_size = MFD_DEF_SIZE; - -/* - * Copied from mlock2-tests.c - */ -static unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static int sys_memfd_create(const char *name, - unsigned int flags) -{ - if (hugetlbfs_test) - flags |= MFD_HUGETLB; - - return syscall(__NR_memfd_create, name, flags); -} +static const char *memfd_str = MEMFD_STR; static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { @@ -513,6 +483,10 @@ static void mfd_assert_grow_write(int fd) static char *buf; ssize_t l; + /* hugetlbfs does not support write */ + if (hugetlbfs_test) + return; + buf = malloc(mfd_def_size * 8); if (!buf) { printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); @@ -533,6 +507,10 @@ static void mfd_fail_grow_write(int fd) static char *buf; ssize_t l; + /* hugetlbfs does not support write */ + if (hugetlbfs_test) + return; + buf = malloc(mfd_def_size * 8); if (!buf) { printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); @@ -598,7 +576,7 @@ static void test_create(void) char buf[2048]; int fd; - printf("%s CREATE\n", MEMFD_STR); + printf("%s CREATE\n", memfd_str); /* test NULL name */ mfd_fail_new(NULL, 0); @@ -627,18 +605,13 @@ static void test_create(void) fd = mfd_assert_new("", 0, MFD_CLOEXEC); close(fd); - if (!hugetlbfs_test) { - /* verify MFD_ALLOW_SEALING is allowed */ - fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); - close(fd); - - /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ - fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); - close(fd); - } else { - /* sealing is not supported on hugetlbfs */ - mfd_fail_new("", MFD_ALLOW_SEALING); - } + /* verify MFD_ALLOW_SEALING is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); + close(fd); + + /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); + close(fd); } /* @@ -649,11 +622,7 @@ static void test_basic(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s BASIC\n", MEMFD_STR); + printf("%s BASIC\n", memfd_str); fd = mfd_assert_new("kern_memfd_basic", mfd_def_size, @@ -698,28 +667,6 @@ static void test_basic(void) } /* - * hugetlbfs doesn't support seals or write, so just verify grow and shrink - * on a hugetlbfs file created via memfd_create. - */ -static void test_hugetlbfs_grow_shrink(void) -{ - int fd; - - printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR); - - fd = mfd_assert_new("kern_memfd_seal_write", - mfd_def_size, - MFD_CLOEXEC); - - mfd_assert_read(fd); - mfd_assert_write(fd); - mfd_assert_shrink(fd); - mfd_assert_grow(fd); - - close(fd); -} - -/* * Test SEAL_WRITE * Test whether SEAL_WRITE actually prevents modifications. */ @@ -727,14 +674,7 @@ static void test_seal_write(void) { int fd; - /* - * hugetlbfs does not contain sealing or write support. Just test - * basic grow and shrink via test_hugetlbfs_grow_shrink. - */ - if (hugetlbfs_test) - return test_hugetlbfs_grow_shrink(); - - printf("%s SEAL-WRITE\n", MEMFD_STR); + printf("%s SEAL-WRITE\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_write", mfd_def_size, @@ -760,11 +700,7 @@ static void test_seal_shrink(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s SEAL-SHRINK\n", MEMFD_STR); + printf("%s SEAL-SHRINK\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_shrink", mfd_def_size, @@ -790,11 +726,7 @@ static void test_seal_grow(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s SEAL-GROW\n", MEMFD_STR); + printf("%s SEAL-GROW\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_grow", mfd_def_size, @@ -820,11 +752,7 @@ static void test_seal_resize(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s SEAL-RESIZE\n", MEMFD_STR); + printf("%s SEAL-RESIZE\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_resize", mfd_def_size, @@ -843,32 +771,6 @@ static void test_seal_resize(void) } /* - * hugetlbfs does not support seals. Basic test to dup the memfd created - * fd and perform some basic operations on it. - */ -static void hugetlbfs_dup(char *b_suffix) -{ - int fd, fd2; - - printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix); - - fd = mfd_assert_new("kern_memfd_share_dup", - mfd_def_size, - MFD_CLOEXEC); - - fd2 = mfd_assert_dup(fd); - - mfd_assert_read(fd); - mfd_assert_write(fd); - - mfd_assert_shrink(fd2); - mfd_assert_grow(fd2); - - close(fd2); - close(fd); -} - -/* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. */ @@ -876,16 +778,7 @@ static void test_share_dup(char *banner, char *b_suffix) { int fd, fd2; - /* - * hugetlbfs does not contain sealing support. Perform some - * basic testing on dup'ed fd instead via hugetlbfs_dup. - */ - if (hugetlbfs_test) { - hugetlbfs_dup(b_suffix); - return; - } - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_dup", mfd_def_size, @@ -927,11 +820,7 @@ static void test_share_mmap(char *banner, char *b_suffix) int fd; void *p; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_mmap", mfd_def_size, @@ -956,32 +845,6 @@ static void test_share_mmap(char *banner, char *b_suffix) } /* - * Basic test to make sure we can open the hugetlbfs fd via /proc and - * perform some simple operations on it. - */ -static void hugetlbfs_proc_open(char *b_suffix) -{ - int fd, fd2; - - printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix); - - fd = mfd_assert_new("kern_memfd_share_open", - mfd_def_size, - MFD_CLOEXEC); - - fd2 = mfd_assert_open(fd, O_RDWR, 0); - - mfd_assert_read(fd); - mfd_assert_write(fd); - - mfd_assert_shrink(fd2); - mfd_assert_grow(fd2); - - close(fd2); - close(fd); -} - -/* * Test sealing with open(/proc/self/fd/%d) * Via /proc we can get access to a separate file-context for the same memfd. * This is *not* like dup(), but like a real separate open(). Make sure the @@ -991,16 +854,7 @@ static void test_share_open(char *banner, char *b_suffix) { int fd, fd2; - /* - * hugetlbfs does not contain sealing support. So test basic - * functionality of using /proc fd via hugetlbfs_proc_open - */ - if (hugetlbfs_test) { - hugetlbfs_proc_open(b_suffix); - return; - } - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_open", mfd_def_size, @@ -1043,11 +897,7 @@ static void test_share_fork(char *banner, char *b_suffix) int fd; pid_t pid; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_fork", mfd_def_size, @@ -1083,7 +933,11 @@ int main(int argc, char **argv) } hugetlbfs_test = 1; + memfd_str = MEMFD_HUGE_STR; mfd_def_size = hpage_size * 2; + } else { + printf("Unknown option: %s\n", argv[1]); + abort(); } } diff --git a/tools/testing/selftests/memfd/run_fuse_test.sh b/tools/testing/selftests/memfd/run_fuse_test.sh index 407df68dfe27..22e572e2d66a 100755 --- a/tools/testing/selftests/memfd/run_fuse_test.sh +++ b/tools/testing/selftests/memfd/run_fuse_test.sh @@ -10,6 +10,6 @@ set -e mkdir mnt ./fuse_mnt ./mnt -./fuse_test ./mnt/memfd +./fuse_test ./mnt/memfd $@ fusermount -u ./mnt rmdir ./mnt diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh index daabb350697c..c2d41ed81b24 100755 --- a/tools/testing/selftests/memfd/run_tests.sh +++ b/tools/testing/selftests/memfd/run_tests.sh @@ -60,6 +60,7 @@ fi # Run the hugetlbfs test # ./memfd_test hugetlbfs +./run_fuse_test.sh hugetlbfs # # Give back any huge pages allocated for the test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 7f45806bd863..fdefa2295ddc 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -8,17 +8,18 @@ endif CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) LDLIBS = -lrt TEST_GEN_FILES = compaction_test +TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += gup_benchmark TEST_PROGS := run_vmtests diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index cc826326de87..d2561895a021 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -177,4 +177,15 @@ else echo "[PASS]" fi +echo "-----------------------------" +echo "running virtual address 128TB switch test" +echo "-----------------------------" +./va_128TBswitch +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c new file mode 100644 index 000000000000..e7fe734c374f --- /dev/null +++ b/tools/testing/selftests/vm/va_128TBswitch.c @@ -0,0 +1,297 @@ +/* + * + * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> + * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <stdio.h> +#include <sys/mman.h> +#include <string.h> + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we used to select + * large address space. + */ +#define ADDR_SWITCH_HINT (1UL << 47) +#define LOW_ADDR ((void *) (1UL << 30)) +#define HIGH_ADDR ((void *) (1UL << 48)) + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it + * The len cross the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free we should get that + * even without MAP_FIXED. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = 0; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = 1; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = 1; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return 0; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/x86/5lvl.c b/tools/testing/selftests/x86/5lvl.c deleted file mode 100644 index 2eafdcd4c2b3..000000000000 --- a/tools/testing/selftests/x86/5lvl.c +++ /dev/null @@ -1,177 +0,0 @@ -#include <stdio.h> -#include <sys/mman.h> - -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) - -#define PAGE_SIZE 4096 -#define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 50)) - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void*) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void*) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 2 * PAGE_SIZE / 2)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap((1UL << 47) - PAGE_SIZE, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void*) -1, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void*) -1, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 4UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 4UL << 20, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - (2UL << 20)), - .size = 4UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap((1UL << 47) - (2UL << 20), 4UL << 20, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -int main(int argc, char **argv) -{ - int i; - void *p; - - for (i = 0; i < ARRAY_SIZE(testcases); i++) { - struct testcase *t = testcases + i; - - p = mmap(t->addr, t->size, PROT_NONE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - continue; - } - - if (t->low_addr_required && p >= (void *)(1UL << 47)) - printf("FAILED\n"); - else - printf("OK\n"); - if (!t->keep_mapped) - munmap(p, t->size); - } - return 0; -} diff --git a/tools/usb/Makefile b/tools/usb/Makefile index 4e6506078494..01d758d73b6d 100644 --- a/tools/usb/Makefile +++ b/tools/usb/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for USB tools -CC = $(CROSS_COMPILE)gcc PTHREAD_LIBS = -lpthread WARNINGS = -Wall -Wextra CFLAGS = $(WARNINGS) -g -I../include diff --git a/tools/vm/Makefile b/tools/vm/Makefile index be320b905ea7..20f6cf04377f 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile @@ -6,7 +6,6 @@ TARGETS=page-types slabinfo page_owner_sort LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a -CC = $(CROSS_COMPILE)gcc CFLAGS = -Wall -Wextra -I../lib/ LDFLAGS = $(LIBS) diff --git a/tools/wmi/Makefile b/tools/wmi/Makefile index e664f1167388..e0e87239126b 100644 --- a/tools/wmi/Makefile +++ b/tools/wmi/Makefile @@ -2,7 +2,6 @@ PREFIX ?= /usr SBINDIR ?= sbin INSTALL ?= install CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include -CC = $(CROSS_COMPILE)gcc TARGET = dell-smbios-example diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a1037659c4fa..09f693db030e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -470,6 +470,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |